crypto: gcm - fix cacheline sharing
diff mbox series

Message ID 1559149856-7938-1-git-send-email-iuliana.prodan@nxp.com
State New, archived
Headers show
Series
  • crypto: gcm - fix cacheline sharing
Related show

Commit Message

Iuliana Prodan May 29, 2019, 5:10 p.m. UTC
The generic GCM driver should ensure that whatever it passes into
scatterlists is safe for non-cache coherent DMA.
The issue was seen while running GCM on CAAM driver. But, since CAAM
does not support GHASH on i.MX6, only CTR skcipher part of the GCM is
offloaded.
The skcipher request received by CAAM has req->src pointing to
auth_tag[16] and req->iv pointing to iv[16]. Problem is that when
the iv is updated (crypto API requires skcipher implementations to
update the IV with the last ciphertext block), it is written in iv[16],
which is on the same cacheline as auth_tag[16] that was previously
DMA mapped.
Solution is to use a pointer, aligned to cache line, instead of auth_tag
buffer, for encryption/decryption and then free it on completion.

Link: https://lore.kernel.org/linux-crypto/20190208114459.5nixe76xmmkhur75@gondor.apana.org.au/
Cc: <stable@vger.kernel.org> # v4.19+
Fixes: adcbc688fe2f ("crypto: gcm - Convert to new AEAD interface")
Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>

---
I've checked the reproducibility of this issue starting with 4.19.y.
---
 crypto/gcm.c         | 26 +++++++++++++++++---------
 include/crypto/gcm.h |  1 +
 2 files changed, 18 insertions(+), 9 deletions(-)

Comments

Eric Biggers May 29, 2019, 8:27 p.m. UTC | #1
On Wed, May 29, 2019 at 08:10:56PM +0300, Iuliana Prodan wrote:
> The generic GCM driver should ensure that whatever it passes into
> scatterlists is safe for non-cache coherent DMA.
> The issue was seen while running GCM on CAAM driver. But, since CAAM
> does not support GHASH on i.MX6, only CTR skcipher part of the GCM is
> offloaded.
> The skcipher request received by CAAM has req->src pointing to
> auth_tag[16] and req->iv pointing to iv[16]. Problem is that when
> the iv is updated (crypto API requires skcipher implementations to
> update the IV with the last ciphertext block) is written in iv[16],
> which is on the same cacheline as auth_tag[16] that was previously
> DMA mapped.
> Solution is to use a pointer, aligned to cache line, instead of auth_tag
> buffer, for encryption/decryption and then free it on completion.
> 
> Link: https://lore.kernel.org/linux-crypto/20190208114459.5nixe76xmmkhur75@gondor.apana.org.au/
> Cc: <stable@vger.kernel.org> # v4.19+
> Fixes: adcbc688fe2f ("crypto: gcm - Convert to new AEAD interface")
> Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>
> 
> ---
> I've checked the reproducibility of this issue starting with 4.19.y.
> ---
>  crypto/gcm.c         | 26 +++++++++++++++++---------
>  include/crypto/gcm.h |  1 +
>  2 files changed, 18 insertions(+), 9 deletions(-)
> 
> diff --git a/crypto/gcm.c b/crypto/gcm.c
> index 33f45a9..53e3ce5 100644
> --- a/crypto/gcm.c
> +++ b/crypto/gcm.c
> @@ -66,7 +66,7 @@ struct crypto_gcm_ghash_ctx {
>  
>  struct crypto_gcm_req_priv_ctx {
>  	u8 iv[16];
> -	u8 auth_tag[16];
> +	u8 *auth_tag;
>  	u8 iauth_tag[16];
>  	struct scatterlist src[3];
>  	struct scatterlist dst[3];
> @@ -177,19 +177,23 @@ static void crypto_gcm_init_common(struct aead_request *req)
>  	__be32 counter = cpu_to_be32(1);
>  	struct scatterlist *sg;
>  
> -	memset(pctx->auth_tag, 0, sizeof(pctx->auth_tag));
> +	/*
> +	 * kzalloc alignment is at least the cache line size
> +	 * for non-cache coherent architectures.
> +	 */
> +	pctx->auth_tag = kzalloc(GCM_MAX_AUTH_SIZE, GFP_KERNEL);
>  	memcpy(pctx->iv, req->iv, GCM_AES_IV_SIZE);
>  	memcpy(pctx->iv + GCM_AES_IV_SIZE, &counter, 4);
>  
>  	sg_init_table(pctx->src, 3);
> -	sg_set_buf(pctx->src, pctx->auth_tag, sizeof(pctx->auth_tag));
> +	sg_set_buf(pctx->src, pctx->auth_tag, GCM_MAX_AUTH_SIZE);
>  	sg = scatterwalk_ffwd(pctx->src + 1, req->src, req->assoclen);
>  	if (sg != pctx->src + 1)
>  		sg_chain(pctx->src, 2, sg);
>  
>  	if (req->src != req->dst) {
>  		sg_init_table(pctx->dst, 3);
> -		sg_set_buf(pctx->dst, pctx->auth_tag, sizeof(pctx->auth_tag));
> +		sg_set_buf(pctx->dst, pctx->auth_tag, GCM_MAX_AUTH_SIZE);
>  		sg = scatterwalk_ffwd(pctx->dst + 1, req->dst, req->assoclen);
>  		if (sg != pctx->dst + 1)
>  			sg_chain(pctx->dst, 2, sg);
> @@ -208,9 +212,8 @@ static void crypto_gcm_init_crypt(struct aead_request *req,
>  	dst = req->src == req->dst ? pctx->src : pctx->dst;
>  
>  	skcipher_request_set_tfm(skreq, ctx->ctr);
> -	skcipher_request_set_crypt(skreq, pctx->src, dst,
> -				     cryptlen + sizeof(pctx->auth_tag),
> -				     pctx->iv);
> +	skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen +
> +				   GCM_MAX_AUTH_SIZE, pctx->iv);
>  }
>  
>  static inline unsigned int gcm_remain(unsigned int len)
> @@ -440,6 +443,7 @@ static int gcm_enc_copy_hash(struct aead_request *req, u32 flags)
>  	scatterwalk_map_and_copy(auth_tag, req->dst,
>  				 req->assoclen + req->cryptlen,
>  				 crypto_aead_authsize(aead), 1);
> +	kfree(auth_tag);
>  	return 0;
>  }
>  
> @@ -492,11 +496,15 @@ static int crypto_gcm_verify(struct aead_request *req)
>  	u8 *iauth_tag = pctx->iauth_tag;
>  	unsigned int authsize = crypto_aead_authsize(aead);
>  	unsigned int cryptlen = req->cryptlen - authsize;
> +	int err;
>  
>  	crypto_xor(auth_tag, iauth_tag, 16);
>  	scatterwalk_map_and_copy(iauth_tag, req->src,
>  				 req->assoclen + cryptlen, authsize, 0);
> -	return crypto_memneq(iauth_tag, auth_tag, authsize) ? -EBADMSG : 0;
> +	err = crypto_memneq(iauth_tag, auth_tag, authsize) ? -EBADMSG : 0;
> +	kfree(auth_tag);
> +
> +	return err;
>  }
>  

So what about the other places that also pass an IV located next to the data,
like crypto/ccm.c and crypto/adiantum.c?  If we're actually going to make this a
new API requirement, then we need to add a debugging option that makes the API
detect this violation so that the other places can be fixed too.

Also, doing a kmalloc() per request is inefficient and very error-prone.  In
fact there are at least 3 bugs here: (1) not checking the return value, (2)
incorrectly using GFP_KERNEL when it may be atomic context, and (3) not always
freeing the memory.  Why not use cacheline-aligned memory within the request
context, so that a separate kmalloc() isn't needed?

Also, did you consider whether there's any way to make the crypto API handle
this automatically, so that all the individual users don't have to?

- Eric
Ard Biesheuvel May 29, 2019, 10:16 p.m. UTC | #2
On Wed, 29 May 2019 at 22:27, Eric Biggers <ebiggers@kernel.org> wrote:
>
> On Wed, May 29, 2019 at 08:10:56PM +0300, Iuliana Prodan wrote:
> > The generic GCM driver should ensure that whatever it passes into
> > scatterlists is safe for non-cache coherent DMA.
> > The issue was seen while running GCM on CAAM driver. But, since CAAM
> > does not support GHASH on i.MX6, only CTR skcipher part of the GCM is
> > offloaded.
> > The skcipher request received by CAAM has req->src pointing to
> > auth_tag[16] and req->iv pointing to iv[16]. Problem is that when
> > the iv is updated (crypto API requires skcipher implementations to
> > update the IV with the last ciphertext block) is written in iv[16],
> > which is on the same cacheline as auth_tag[16] that was previously
> > DMA mapped.
> > Solution is to use a pointer, aligned to cache line, instead of auth_tag
> > buffer, for encryption/decryption and then free it on completion.
> >
> > Link: https://lore.kernel.org/linux-crypto/20190208114459.5nixe76xmmkhur75@gondor.apana.org.au/
> > Cc: <stable@vger.kernel.org> # v4.19+
> > Fixes: adcbc688fe2f ("crypto: gcm - Convert to new AEAD interface")
> > Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> > Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>
> >
...
> So what about the other places that also pass an IV located next to the data,
> like crypto/ccm.c and crypto/adiantum.c?  If we're actually going to make this a
> new API requirement, then we need to add a debugging option that makes the API
> detect this violation so that the other places can be fixed too.
>
> Also, doing a kmalloc() per requset is inefficient and very error-prone.  In
> fact there are at least 3 bugs here: (1) not checking the return value, (2)
> incorrectly using GFP_KERNEL when it may be atomic context, and (3) not always
> freeing the memory.  Why not use cacheline-aligned memory within the request
> context, so that a separate kmalloc() isn't needed?
>
> Also, did you consider whether there's any way to make the crypto API handle
> this automatically, so that all the individual users don't have to?
>

Reading back that old thread, it appears that the core issue is that
the IV is copied when the scatterlist is already mapped for DMA. This
means the cacheline covering the IV and the auth tag is dirty while
the non-coherent DMA transaction takes place, and given that we clean
rather than invalidate the start and end of DMA mappings if they are
not aligned to the cache writeback granule size, whatever sits in the
cacheline overwrites whatever the device wrote in there.

Iuliana, did you try pulling the IV copy forward? I.e.,

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index c0ece44f303b..11e91c0c9a96 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -1835,11 +1835,6 @@ static int skcipher_decrypt(struct skcipher_request *req)
        u32 *desc;
        int ret = 0;

-       /* allocate extended descriptor */
-       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
-       if (IS_ERR(edesc))
-               return PTR_ERR(edesc);
-
        /*
         * The crypto API expects us to set the IV (req->iv) to the last
         * ciphertext block.
@@ -1848,6 +1843,11 @@ static int skcipher_decrypt(struct skcipher_request *req)
                scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
                                         ivsize, ivsize, 0);

+       /* allocate extended descriptor */
+       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
+       if (IS_ERR(edesc))
+               return PTR_ERR(edesc);
+
        /* Create and submit job descriptor*/
        init_skcipher_job(req, edesc, false);
        desc = edesc->hw_desc;

This should ensure that the cacheline is cleaned when the DMA mapping
is created.
Herbert Xu May 30, 2019, 5:34 a.m. UTC | #3
On Wed, May 29, 2019 at 01:27:28PM -0700, Eric Biggers wrote:
>
> So what about the other places that also pass an IV located next to the data,
> like crypto/ccm.c and crypto/adiantum.c?  If we're actually going to make this a
> new API requirement, then we need to add a debugging option that makes the API
> detect this violation so that the other places can be fixed too.
> 
> Also, doing a kmalloc() per requset is inefficient and very error-prone.  In
> fact there are at least 3 bugs here: (1) not checking the return value, (2)
> incorrectly using GFP_KERNEL when it may be atomic context, and (3) not always
> freeing the memory.  Why not use cacheline-aligned memory within the request
> context, so that a separate kmalloc() isn't needed?
> 
> Also, did you consider whether there's any way to make the crypto API handle
> this automatically, so that all the individual users don't have to?

You're absolutely right Eric.

What I suggested in the old thread is nonsense.  While you can
force GCM to provide the right pointers you cannot force all the
other crypto API users to do this.

It would appear that Ard's latest suggestion should fix the problem
and is the correct approach.

Thanks,
Horia Geanta May 30, 2019, 7:46 a.m. UTC | #4
On 5/30/2019 8:34 AM, Herbert Xu wrote:
> On Wed, May 29, 2019 at 01:27:28PM -0700, Eric Biggers wrote:
>>
>> So what about the other places that also pass an IV located next to the data,
>> like crypto/ccm.c and crypto/adiantum.c?  If we're actually going to make this a
Fix for ccm is WIP.
We were not aware of adiantum since our crypto engine does not accelerate it.

>> new API requirement, then we need to add a debugging option that makes the API
>> detect this violation so that the other places can be fixed too.
>>
IMO this is not a new crypto API requirement.
crypto API and its users must follow DMA API rules, besides crypto-specific ones.

In this particular case, crypto/gcm.c is both an implementation and a crypto API
user, since it uses underneath ctr(aes) (and ghash).
Currently generic gcm implementation is breaking DMA API, since part of the dst
buffer (auth_tag) provided to ctr(aes) is sharing a cache line with some other
data structure (iv).

The DMA API rule is mentioned in Documentation/DMA-API.txt

.. warning::

        Memory coherency operates at a granularity called the cache
        line width.  In order for memory mapped by this API to operate
        correctly, the mapped region must begin exactly on a cache line
        boundary and end exactly on one (to prevent two separately mapped
        regions from sharing a single cache line).


>> Also, doing a kmalloc() per requset is inefficient and very error-prone.  In
>> fact there are at least 3 bugs here: (1) not checking the return value, (2)
>> incorrectly using GFP_KERNEL when it may be atomic context, and (3) not always
For (2) I assume this means checking for CRYPTO_TFM_REQ_MAY_SLEEP flag.

>> freeing the memory.  Why not use cacheline-aligned memory within the request
>> context, so that a separate kmalloc() isn't needed?
>>
If you check previous discussion referenced in the commit message:
Link:
https://lore.kernel.org/linux-crypto/20190208114459.5nixe76xmmkhur75@gondor.apana.org.au/

or (probably easier) look at the full thread:
https://patchwork.kernel.org/patch/10789697/

you'll see that at some point I proposed changing crypto_gcm_req_priv_ctx struct
as follows:
-       u8 auth_tag[16];
+       u8 auth_tag[16] ____cacheline_aligned;

Ard suggested it would be better to kmalloc the auth_tag.

I am open to changing the fix, however I don't think the problem is in the
implementation (caam driver).

>> Also, did you consider whether there's any way to make the crypto API handle
>> this automatically, so that all the individual users don't have to?
That would probably work, but I guess it would come up with a big overhead.

I am thinking crypto API would have to check each buffer used by src, dst
scatterlists is correctly aligned - starting and ending on cache line boundaries.

> 
> You're absolutely right Eric.
> 
> What I suggested in the old thread is non-sense.  While you can
> force GCM to provide the right pointers you cannot force all the
> other crypto API users to do this.
> 
Whose problem is that crypto API users don't follow the DMA API requirements?

> It would appear that Ard's latest suggestion should fix the problem
> and is the correct approach.
> 
I disagree.
We shouldn't force crypto implementations to be aware of such inconsistencies in
the I/O data buffers (pointed to by src/dst scatterlists) that are supposed to
be safely DMA mapped.

Thanks,
Horia
Ard Biesheuvel May 30, 2019, 8:08 a.m. UTC | #5
On Thu, 30 May 2019 at 09:46, Horia Geanta <horia.geanta@nxp.com> wrote:
>
> On 5/30/2019 8:34 AM, Herbert Xu wrote:
> > On Wed, May 29, 2019 at 01:27:28PM -0700, Eric Biggers wrote:
> >>
> >> So what about the other places that also pass an IV located next to the data,
> >> like crypto/ccm.c and crypto/adiantum.c?  If we're actually going to make this a
> Fix for ccm is WIP.
> We were not aware of adiantum since our crypto engine does not accelerate it.
>
> >> new API requirement, then we need to add a debugging option that makes the API
> >> detect this violation so that the other places can be fixed too.
> >>
> IMO this is not a new crypto API requirement.
> crypto API and its users must follow DMA API rules, besides crypto-specific ones.
>
> In this particular case, crypto/gcm.c is both an implementation and a crypto API
> user, since it uses underneath ctr(aes) (and ghash).
> Currently generic gcm implementation is breaking DMA API, since part of the dst
> buffer (auth_tag) provided to ctr(aes) is sharing a cache line with some other
> data structure (iv).
>
> The DMA API rule is mentioned in Documentation/DMA-API.txt
>
> .. warning::
>
>         Memory coherency operates at a granularity called the cache
>         line width.  In order for memory mapped by this API to operate
>         correctly, the mapped region must begin exactly on a cache line
>         boundary and end exactly on one (to prevent two separately mapped
>         regions from sharing a single cache line).
>
>

This is overly restrictive, and not in line with reality. The whole
networking stack operates on buffers shifted by 2 bytes if
NET_IP_ALIGN is left at its default value of 2. There are numerous
examples in other places as well.

Given that kmalloc() will take the cacheline granularity into account
if necessary, the only way this issue can hit is when a single kmalloc
buffer is written to by two different masters.

> >> Also, doing a kmalloc() per requset is inefficient and very error-prone.  In
> >> fact there are at least 3 bugs here: (1) not checking the return value, (2)
> >> incorrectly using GFP_KERNEL when it may be atomic context, and (3) not always
> For (2) I assume this means checking for CRYPTO_TFM_REQ_MAY_SLEEP flag.
>
> >> freeing the memory.  Why not use cacheline-aligned memory within the request
> >> context, so that a separate kmalloc() isn't needed?
> >>
> If you check previous discussion referenced in the commit message:
> Link:
> https://lore.kernel.org/linux-crypto/20190208114459.5nixe76xmmkhur75@gondor.apana.org.au/
>
> or (probably easier) look at the full thread:
> https://patchwork.kernel.org/patch/10789697/
>
> you'll see that at some point I proposed changing crypto_gcm_req_priv_ctx struct
> as follows:
> -       u8 auth_tag[16];
> +       u8 auth_tag[16] ____cacheline_aligned;
>
> Ard suggested it would be better to kmalloc the auth_tag.
>
> I am open to changing the fix, however I don't think the problem is in the
> implementation (caam driver).
>

I remember that. But in the discussion that followed, I did ask about
accessing the memory while the buffer is mapped for DMA, and I
misunderstood the response. The scatterwalk_map_and_copy writes to the
request while it is mapped for DMA.

> >> Also, did you consider whether there's any way to make the crypto API handle
> >> this automatically, so that all the individual users don't have to?
> That would probably work, but I guess it would come up with a big overhead.
>
> I am thinking crypto API would have to check each buffer used by src, dst
> scatterlists is correctly aligned - starting and ending on cache line boundaries.
>
> >
> > You're absolutely right Eric.
> >
> > What I suggested in the old thread is non-sense.  While you can
> > force GCM to provide the right pointers you cannot force all the
> > other crypto API users to do this.
> >
> Whose problem is that crypto API users don't follow the DMA API requirements?
>
> > It would appear that Ard's latest suggestion should fix the problem
> > and is the correct approach.
> >
> I disagree.
> We shouldn't force crypto implementations to be aware of such inconsistencies in
> the I/O data buffers (pointed to by src/dst scatterlists) that are supposed to
> be safely DMA mapped.
>

I'm on the fence here. On the one hand, it is slightly dodgy for the
GCM driver to pass a scatterlist referencing a buffer that shares a
cacheline with another buffer passed by an ordinary pointer, and for
which an explicit requirement exists that the callee should update it
before returning.

On the other hand, I think it is reasonable to require drivers not to
perform such updates while the scatterlist is mapped for DMA, since
fixing it in the callers puts a disproportionate burden on them, given
that non-coherent DMA only represents a small minority of use cases.
Horia Geanta May 30, 2019, 1:18 p.m. UTC | #6
On 5/30/2019 11:08 AM, Ard Biesheuvel wrote:
> On Thu, 30 May 2019 at 09:46, Horia Geanta <horia.geanta@nxp.com> wrote:
>>
>> On 5/30/2019 8:34 AM, Herbert Xu wrote:
>>> On Wed, May 29, 2019 at 01:27:28PM -0700, Eric Biggers wrote:
>>>>
>>>> So what about the other places that also pass an IV located next to the data,
>>>> like crypto/ccm.c and crypto/adiantum.c?  If we're actually going to make this a
>> Fix for ccm is WIP.
>> We were not aware of adiantum since our crypto engine does not accelerate it.
>>
>>>> new API requirement, then we need to add a debugging option that makes the API
>>>> detect this violation so that the other places can be fixed too.
>>>>
>> IMO this is not a new crypto API requirement.
>> crypto API and its users must follow DMA API rules, besides crypto-specific ones.
>>
>> In this particular case, crypto/gcm.c is both an implementation and a crypto API
>> user, since it uses underneath ctr(aes) (and ghash).
>> Currently generic gcm implementation is breaking DMA API, since part of the dst
>> buffer (auth_tag) provided to ctr(aes) is sharing a cache line with some other
>> data structure (iv).
>>
>> The DMA API rule is mentioned in Documentation/DMA-API.txt
>>
>> .. warning::
>>
>>         Memory coherency operates at a granularity called the cache
>>         line width.  In order for memory mapped by this API to operate
>>         correctly, the mapped region must begin exactly on a cache line
>>         boundary and end exactly on one (to prevent two separately mapped
>>         regions from sharing a single cache line).
>>
>>
> 
> This is overly restrictive, and not in line with reality. The whole
> networking stack operates on buffers shifted by 2 bytes if
> NET_IP_ALIGN is left at its default value of 2. There are numerous
> examples in other places as well.
> 
> Given that kmalloc() will take the cacheline granularity into account
> if necessary, the only way this issue can hit is when a single kmalloc
> buffer is written to by two different masters.
> 
I guess there are only two options:
-either cache line sharing is avoided OR
-users need to be *aware* they are sharing the cache line and some rules /
assumptions are in place on how to safely work on the data

What you are probably saying is that 2nd option is sometimes the way to go.

>>>> Also, did you consider whether there's any way to make the crypto API handle
>>>> this automatically, so that all the individual users don't have to?
>> That would probably work, but I guess it would come up with a big overhead.
>>
>> I am thinking crypto API would have to check each buffer used by src, dst
>> scatterlists is correctly aligned - starting and ending on cache line boundaries.
>>
>>>
>>> You're absolutely right Eric.
>>>
>>> What I suggested in the old thread is non-sense.  While you can
>>> force GCM to provide the right pointers you cannot force all the
>>> other crypto API users to do this.
>>>
>> Whose problem is that crypto API users don't follow the DMA API requirements?
>>
>>> It would appear that Ard's latest suggestion should fix the problem
>>> and is the correct approach.
>>>
>> I disagree.
>> We shouldn't force crypto implementations to be aware of such inconsistencies in
>> the I/O data buffers (pointed to by src/dst scatterlists) that are supposed to
>> be safely DMA mapped.
>>
> 
> I'm on the fence here. On the one hand, it is slightly dodgy for the
> GCM driver to pass a scatterlist referencing a buffer that shares a
> cacheline with another buffer passed by an ordinary pointer, and for
> which an explicit requirement exists that the callee should update it
> before returning.
> 
> On the other hand, I think it is reasonable to require drivers not to
> perform such updates while the scatterlist is mapped for DMA, since
> fixing it in the callers puts a disproportionate burden on them, given
> that non-coherent DMA only represents a small minority of use cases.
> 
The problem with this approach is that the buffers in the scatterlist could
hypothetically share cache lines with *any* other CPU-updated data, not just the
IV in the crypto request (as it happens here).
How could a non-coherent DMA implementation cope with this?

Thanks,
Horia
Ard Biesheuvel May 30, 2019, 1:25 p.m. UTC | #7
On Thu, 30 May 2019 at 15:18, Horia Geanta <horia.geanta@nxp.com> wrote:
>
> On 5/30/2019 11:08 AM, Ard Biesheuvel wrote:
> > On Thu, 30 May 2019 at 09:46, Horia Geanta <horia.geanta@nxp.com> wrote:
> >>
> >> On 5/30/2019 8:34 AM, Herbert Xu wrote:
> >>> On Wed, May 29, 2019 at 01:27:28PM -0700, Eric Biggers wrote:
> >>>>
> >>>> So what about the other places that also pass an IV located next to the data,
> >>>> like crypto/ccm.c and crypto/adiantum.c?  If we're actually going to make this a
> >> Fix for ccm is WIP.
> >> We were not aware of adiantum since our crypto engine does not accelerate it.
> >>
> >>>> new API requirement, then we need to add a debugging option that makes the API
> >>>> detect this violation so that the other places can be fixed too.
> >>>>
> >> IMO this is not a new crypto API requirement.
> >> crypto API and its users must follow DMA API rules, besides crypto-specific ones.
> >>
> >> In this particular case, crypto/gcm.c is both an implementation and a crypto API
> >> user, since it uses underneath ctr(aes) (and ghash).
> >> Currently generic gcm implementation is breaking DMA API, since part of the dst
> >> buffer (auth_tag) provided to ctr(aes) is sharing a cache line with some other
> >> data structure (iv).
> >>
> >> The DMA API rule is mentioned in Documentation/DMA-API.txt
> >>
> >> .. warning::
> >>
> >>         Memory coherency operates at a granularity called the cache
> >>         line width.  In order for memory mapped by this API to operate
> >>         correctly, the mapped region must begin exactly on a cache line
> >>         boundary and end exactly on one (to prevent two separately mapped
> >>         regions from sharing a single cache line).
> >>
> >>
> >
> > This is overly restrictive, and not in line with reality. The whole
> > networking stack operates on buffers shifted by 2 bytes if
> > NET_IP_ALIGN is left at its default value of 2. There are numerous
> > examples in other places as well.
> >
> > Given that kmalloc() will take the cacheline granularity into account
> > if necessary, the only way this issue can hit is when a single kmalloc
> > buffer is written to by two different masters.
> >
> I guess there are only two options:
> -either cache line sharing is avoided OR
> -users need to be *aware* they are sharing the cache line and some rules /
> assumptions are in place on how to safely work on the data
>
> What you are probably saying is that 2nd option is sometimes the way to go.
>
> >>>> Also, did you consider whether there's any way to make the crypto API handle
> >>>> this automatically, so that all the individual users don't have to?
> >> That would probably work, but I guess it would come up with a big overhead.
> >>
> >> I am thinking crypto API would have to check each buffer used by src, dst
> >> scatterlists is correctly aligned - starting and ending on cache line boundaries.
> >>
> >>>
> >>> You're absolutely right Eric.
> >>>
> >>> What I suggested in the old thread is non-sense.  While you can
> >>> force GCM to provide the right pointers you cannot force all the
> >>> other crypto API users to do this.
> >>>
> >> Whose problem is that crypto API users don't follow the DMA API requirements?
> >>
> >>> It would appear that Ard's latest suggestion should fix the problem
> >>> and is the correct approach.
> >>>
> >> I disagree.
> >> We shouldn't force crypto implementations to be aware of such inconsistencies in
> >> the I/O data buffers (pointed to by src/dst scatterlists) that are supposed to
> >> be safely DMA mapped.
> >>
> >
> > I'm on the fence here. On the one hand, it is slightly dodgy for the
> > GCM driver to pass a scatterlist referencing a buffer that shares a
> > cacheline with another buffer passed by an ordinary pointer, and for
> > which an explicit requirement exists that the callee should update it
> > before returning.
> >
> > On the other hand, I think it is reasonable to require drivers not to
> > perform such updates while the scatterlist is mapped for DMA, since
> > fixing it in the callers puts a disproportionate burden on them, given
> > that non-coherent DMA only represents a small minority of use cases.
> >
> The problem with this approach is that the buffers in the scatterlist could
> hypothetically share cache lines with *any* other CPU-updated data, not just the
> IV in the crypto request (as it happens here).
> How could a non-coherent DMA implementation cope with this?
>

I don't think the situation is that bad. We only support allocations
in the linear map for DMA/scatterlists, and buffers in the linear map
can only share a cacheline if they were allocated using the same
kmalloc() invocation (on systems that support non-coherent DMA)

That does mean that it is actually more straightforward to deal with
this at the level where the allocation occurs, and that would justify
dealing with it in the callers.

However, from the callee's point of view, it simply means that you
should not dereference any request struct pointers for writing while
the 'dst' scatterlist is mapped for DMA.

As I said, I am on the fence. I think there are arguments for both sides ...
Herbert Xu May 30, 2019, 1:26 p.m. UTC | #8
On Thu, May 30, 2019 at 01:18:34PM +0000, Horia Geanta wrote:
>
> I guess there are only two options:
> -either cache line sharing is avoided OR
> -users need to be *aware* they are sharing the cache line and some rules /
> assumptions are in place on how to safely work on the data

No there is a third option and it's very simple:

You must only access the virtual addresses given to you before DMA
mapping or after DMA unmapping.

Cheers,
Iuliana Prodan May 30, 2019, 1:29 p.m. UTC | #9
On 5/30/2019 1:16 AM, Ard Biesheuvel wrote:
> On Wed, 29 May 2019 at 22:27, Eric Biggers <ebiggers@kernel.org> wrote:
>>
>> On Wed, May 29, 2019 at 08:10:56PM +0300, Iuliana Prodan wrote:
>>> The generic GCM driver should ensure that whatever it passes into
>>> scatterlists is safe for non-cache coherent DMA.
>>> The issue was seen while running GCM on CAAM driver. But, since CAAM
>>> does not support GHASH on i.MX6, only CTR skcipher part of the GCM is
>>> offloaded.
>>> The skcipher request received by CAAM has req->src pointing to
>>> auth_tag[16] and req->iv pointing to iv[16]. Problem is that when
>>> the iv is updated (crypto API requires skcipher implementations to
>>> update the IV with the last ciphertext block) is written in iv[16],
>>> which is on the same cacheline as auth_tag[16] that was previously
>>> DMA mapped.
>>> Solution is to use a pointer, aligned to cache line, instead of auth_tag
>>> buffer, for encryption/decryption and then free it on completion.
>>>
>>> Link: https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2Flinux-crypto%2F20190208114459.5nixe76xmmkhur75%40gondor.apana.org.au%2F&amp;data=02%7C01%7Ciuliana.prodan%40nxp.com%7C8426e8db85774c4e829308d6e4834cf6%7C686ea1d3bc2b4c6fa92cd99c5c301635%7C0%7C0%7C636947649914200400&amp;sdata=Fq%2F83lofKoEnz7HnzY1jjU1jUi0b2QmVZPfZrzWXtrk%3D&amp;reserved=0
>>> Cc: <stable@vger.kernel.org> # v4.19+
>>> Fixes: adcbc688fe2f ("crypto: gcm - Convert to new AEAD interface")
>>> Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
>>> Signed-off-by: Iuliana Prodan <iuliana.prodan@nxp.com>
>>>
> ...
>> So what about the other places that also pass an IV located next to the data,
>> like crypto/ccm.c and crypto/adiantum.c?  If we're actually going to make this a
>> new API requirement, then we need to add a debugging option that makes the API
>> detect this violation so that the other places can be fixed too.
>>
>> Also, doing a kmalloc() per request is inefficient and very error-prone.  In
>> fact there are at least 3 bugs here: (1) not checking the return value, (2)
>> incorrectly using GFP_KERNEL when it may be atomic context, and (3) not always
>> freeing the memory.  Why not use cacheline-aligned memory within the request
>> context, so that a separate kmalloc() isn't needed?
>>
>> Also, did you consider whether there's any way to make the crypto API handle
>> this automatically, so that all the individual users don't have to?
>>
> 
> Reading back that old thread, it appears that the core issue is that
> the IV is copied when the scatterlist is already mapped for DMA. This
> means the cacheline covering the IV and the auth tag is dirty while
> the non-coherent DMA transaction takes place, and given that we clean
> rather than invalidate the start and end of DMA mappings if they are
> not aligned to the cache writeback granule size, whatever sits in the
> cacheline overwrites whatever the device wrote in there.
> 
> Iuliana, did you try pulling the IV copy forward? I.e.,

I've tried copying the IV before the extended descriptor allocation, but 
it is not working, and to make it work we will need to make more changes in 
CAAM. We need the original iv, and if we move it before 
skcipher_edesc_alloc we lose it.
The fix exclusively in CAAM drv, to copy iv before DMA map, is more complex.

> 
> diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> index c0ece44f303b..11e91c0c9a96 100644
> --- a/drivers/crypto/caam/caamalg.c
> +++ b/drivers/crypto/caam/caamalg.c
> @@ -1835,11 +1835,6 @@ static int skcipher_decrypt(struct skcipher_request *req)
>          u32 *desc;
>          int ret = 0;
> 
> -       /* allocate extended descriptor */
> -       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> -       if (IS_ERR(edesc))
> -               return PTR_ERR(edesc);
> -
>          /*
>           * The crypto API expects us to set the IV (req->iv) to the last
>           * ciphertext block.
> @@ -1848,6 +1843,11 @@ static int skcipher_decrypt(struct skcipher_request *req)
>                  scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
>                                           ivsize, ivsize, 0);
> 
> +       /* allocate extended descriptor */
> +       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> +       if (IS_ERR(edesc))
> +               return PTR_ERR(edesc);
> +
>          /* Create and submit job descriptor*/
>          init_skcipher_job(req, edesc, false);
>          desc = edesc->hw_desc;
> 
> This should ensure that the cacheline is cleaned when the DMA mapping
> is created.
> 

Thanks,
Iulia
Herbert Xu May 30, 2019, 1:34 p.m. UTC | #10
On Thu, May 30, 2019 at 01:29:41PM +0000, Iuliana Prodan wrote:
>
> I've tried coping the IV before the extended descriptor allocation, but 
> is not working and to make it work will need to make more changes in 
> CAAM. We need the original iv, and if we move it before 
> skcipher_edesc_alloc we lose it.
> The fix exclusively in CAAM drv, to copy iv before DMA map, is more complex.

Why doesn't it work (apart from the fact that this only makes sense
for CBC and yet you're doing it for everything including CTR)?

Cheers,
Iuliana Prodan May 30, 2019, 1:45 p.m. UTC | #11
On 5/30/2019 4:34 PM, Herbert Xu wrote:
> On Thu, May 30, 2019 at 01:29:41PM +0000, Iuliana Prodan wrote:
>>
>> I've tried coping the IV before the extended descriptor allocation, but
>> is not working and to make it work will need to make more changes in
>> CAAM. We need the original iv, and if we move it before
>> skcipher_edesc_alloc we lose it.
>> The fix exclusively in CAAM drv, to copy iv before DMA map, is more complex.
> 
> Why doesn't it work (apart from the fact that this only makes sense
> for CBC and yet you're doing it for everything including CTR)?
> 
> Cheers,
> 

In the current structure of caamalg, to make this work, the iv needs to be 
copied before the memcpy(iv, req->iv, ivsize) in the skcipher_edesc_alloc 
function. For this we need edesc, but this cannot be allocated before knowing 
how much memory we need. So, to make it work, we'll need to modify more in CAAM.

Thanks,
Iulia
Ard Biesheuvel May 30, 2019, 1:53 p.m. UTC | #12
On Thu, 30 May 2019 at 15:45, Iuliana Prodan <iuliana.prodan@nxp.com> wrote:
>
> On 5/30/2019 4:34 PM, Herbert Xu wrote:
> > On Thu, May 30, 2019 at 01:29:41PM +0000, Iuliana Prodan wrote:
> >>
> >> I've tried coping the IV before the extended descriptor allocation, but
> >> is not working and to make it work will need to make more changes in
> >> CAAM. We need the original iv, and if we move it before
> >> skcipher_edesc_alloc we lose it.
> >> The fix exclusively in CAAM drv, to copy iv before DMA map, is more complex.
> >
> > Why doesn't it work (apart from the fact that this only makes sense
> > for CBC and yet you're doing it for everything including CTR)?
> >
> > Cheers,
> >
>
> On the current structure of caamalg, to work, iv needs to be copied
> before memcpy(iv, req->iv, ivsize), from skcipher_edesc_alloc function.
> For this we need edesc, but this cannot be allocated before knowing how
> much memory we need. So, to make it work, we'll need to modify more in CAAM.
>

Would this work?

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index c0ece44f303b..2ef2f76a3cb8 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -1832,22 +1832,25 @@ static int skcipher_decrypt(struct
skcipher_request *req)
        struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
        int ivsize = crypto_skcipher_ivsize(skcipher);
        struct device *jrdev = ctx->jrdev;
+       u8 out_iv[AES_BLOCK_SIZE];
        u32 *desc;
        int ret = 0;

-       /* allocate extended descriptor */
-       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
-       if (IS_ERR(edesc))
-               return PTR_ERR(edesc);
-
        /*
         * The crypto API expects us to set the IV (req->iv) to the last
         * ciphertext block.
         */
        if (ivsize)
-               scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
+               scatterwalk_map_and_copy(out_iv, req->src, req->cryptlen -
                                         ivsize, ivsize, 0);

+       /* allocate extended descriptor */
+       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
+       if (IS_ERR(edesc))
+               return PTR_ERR(edesc);
+
+       memcpy(req->iv, out_iv, ivsize);
+
        /* Create and submit job descriptor*/
        init_skcipher_job(req, edesc, false);
        desc = edesc->hw_desc;
Ard Biesheuvel May 30, 2019, 1:55 p.m. UTC | #13
On Thu, 30 May 2019 at 15:53, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> On Thu, 30 May 2019 at 15:45, Iuliana Prodan <iuliana.prodan@nxp.com> wrote:
> >
> > On 5/30/2019 4:34 PM, Herbert Xu wrote:
> > > On Thu, May 30, 2019 at 01:29:41PM +0000, Iuliana Prodan wrote:
> > >>
> > >> I've tried coping the IV before the extended descriptor allocation, but
> > >> is not working and to make it work will need to make more changes in
> > >> CAAM. We need the original iv, and if we move it before
> > >> skcipher_edesc_alloc we lose it.
> > >> The fix exclusively in CAAM drv, to copy iv before DMA map, is more complex.
> > >
> > > Why doesn't it work (apart from the fact that this only makes sense
> > > for CBC and yet you're doing it for everything including CTR)?
> > >
> > > Cheers,
> > >
> >
> > On the current structure of caamalg, to work, iv needs to be copied
> > before memcpy(iv, req->iv, ivsize), from skcipher_edesc_alloc function.
> > For this we need edesc, but this cannot be allocated before knowing how
> > much memory we need. So, to make it work, we'll need to modify more in CAAM.
> >
>
> Would this work?
>
> diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> index c0ece44f303b..2ef2f76a3cb8 100644
> --- a/drivers/crypto/caam/caamalg.c
> +++ b/drivers/crypto/caam/caamalg.c
> @@ -1832,22 +1832,25 @@ static int skcipher_decrypt(struct
> skcipher_request *req)
>         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
>         int ivsize = crypto_skcipher_ivsize(skcipher);
>         struct device *jrdev = ctx->jrdev;
> +       u8 out_iv[AES_BLOCK_SIZE];
>         u32 *desc;
>         int ret = 0;
>
> -       /* allocate extended descriptor */
> -       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> -       if (IS_ERR(edesc))
> -               return PTR_ERR(edesc);
> -
>         /*
>          * The crypto API expects us to set the IV (req->iv) to the last
>          * ciphertext block.
>          */
>         if (ivsize)
> -               scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
> +               scatterwalk_map_and_copy(out_iv, req->src, req->cryptlen -
>                                          ivsize, ivsize, 0);
>
> +       /* allocate extended descriptor */
> +       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> +       if (IS_ERR(edesc))
> +               return PTR_ERR(edesc);
> +
> +       memcpy(req->iv, out_iv, ivsize);
> +
>         /* Create and submit job descriptor*/
>         init_skcipher_job(req, edesc, false);
>         desc = edesc->hw_desc;

Umm never mind

/me hides in shame
Herbert Xu May 30, 2019, 1:58 p.m. UTC | #14
On Thu, May 30, 2019 at 01:45:47PM +0000, Iuliana Prodan wrote:
>
> On the current structure of caamalg, to work, iv needs to be copied 
> before memcpy(iv, req->iv, ivsize), from skcipher_edesc_alloc function. 
> For this we need edesc, but this cannot be allocated before knowing how 
> much memory we need. So, to make it work, we'll need to modify more in CAAM.

All the copying does is:

	if (ivsize)
		scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
					 ivsize, ivsize, 0);

Why do you need to allocate the edesc before doing this?

Cheers,
Ard Biesheuvel May 30, 2019, 2:11 p.m. UTC | #15
On Thu, 30 May 2019 at 15:58, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, May 30, 2019 at 01:45:47PM +0000, Iuliana Prodan wrote:
> >
> > On the current structure of caamalg, to work, iv needs to be copied
> > before memcpy(iv, req->iv, ivsize), from skcipher_edesc_alloc function.
> > For this we need edesc, but this cannot be allocated before knowing how
> > much memory we need. So, to make it work, we'll need to modify more in CAAM.
>
> All the copying does is:
>
>         if (ivsize)
>                 scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
>                                          ivsize, ivsize, 0);
>
> Why do you need to allocate the edesc before doing this?
>

Because that is where the incoming iv is currently consumed. Copying
it out like this wipes the input IV from memory.
Herbert Xu May 30, 2019, 2:27 p.m. UTC | #16
On Thu, May 30, 2019 at 03:55:07PM +0200, Ard Biesheuvel wrote:
>
> > Would this work?

I see.  You need to preserve the original IV.

> > diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> > index c0ece44f303b..2ef2f76a3cb8 100644
> > --- a/drivers/crypto/caam/caamalg.c
> > +++ b/drivers/crypto/caam/caamalg.c
> > @@ -1832,22 +1832,25 @@ static int skcipher_decrypt(struct
> > skcipher_request *req)
> >         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
> >         int ivsize = crypto_skcipher_ivsize(skcipher);
> >         struct device *jrdev = ctx->jrdev;
> > +       u8 out_iv[AES_BLOCK_SIZE];
> >         u32 *desc;
> >         int ret = 0;
> >
> > -       /* allocate extended descriptor */
> > -       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> > -       if (IS_ERR(edesc))
> > -               return PTR_ERR(edesc);
> > -
> >         /*
> >          * The crypto API expects us to set the IV (req->iv) to the last
> >          * ciphertext block.
> >          */
> >         if (ivsize)
> > -               scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
> > +               scatterwalk_map_and_copy(out_iv, req->src, req->cryptlen -
> >                                          ivsize, ivsize, 0);
> >
> > +       /* allocate extended descriptor */
> > +       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> > +       if (IS_ERR(edesc))
> > +               return PTR_ERR(edesc);
> > +
> > +       memcpy(req->iv, out_iv, ivsize);
> > +
> >         /* Create and submit job descriptor*/
> >         init_skcipher_job(req, edesc, false);
> >         desc = edesc->hw_desc;
> 
> Umm never mind
> 
> /me hides in shame

So why doesn't this work?

Cheers,
Ard Biesheuvel May 30, 2019, 2:28 p.m. UTC | #17
On Thu, 30 May 2019 at 16:27, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, May 30, 2019 at 03:55:07PM +0200, Ard Biesheuvel wrote:
> >
> > > Would this work?
>
> I see.  You need to preserve the original IV.
>
> > > diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> > > index c0ece44f303b..2ef2f76a3cb8 100644
> > > --- a/drivers/crypto/caam/caamalg.c
> > > +++ b/drivers/crypto/caam/caamalg.c
> > > @@ -1832,22 +1832,25 @@ static int skcipher_decrypt(struct
> > > skcipher_request *req)
> > >         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
> > >         int ivsize = crypto_skcipher_ivsize(skcipher);
> > >         struct device *jrdev = ctx->jrdev;
> > > +       u8 out_iv[AES_BLOCK_SIZE];
> > >         u32 *desc;
> > >         int ret = 0;
> > >
> > > -       /* allocate extended descriptor */
> > > -       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> > > -       if (IS_ERR(edesc))
> > > -               return PTR_ERR(edesc);
> > > -
> > >         /*
> > >          * The crypto API expects us to set the IV (req->iv) to the last
> > >          * ciphertext block.
> > >          */
> > >         if (ivsize)
> > > -               scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
> > > +               scatterwalk_map_and_copy(out_iv, req->src, req->cryptlen -
> > >                                          ivsize, ivsize, 0);
> > >
> > > +       /* allocate extended descriptor */
> > > +       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> > > +       if (IS_ERR(edesc))
> > > +               return PTR_ERR(edesc);
> > > +
> > > +       memcpy(req->iv, out_iv, ivsize);
> > > +
> > >         /* Create and submit job descriptor*/
> > >         init_skcipher_job(req, edesc, false);
> > >         desc = edesc->hw_desc;
> >
> > Umm never mind
> >
> > /me hides in shame
>
> So why doesn't this work?
>

Because the memcpy() occurs while the buffer is mapped for DMA, so it
suffers from the exact same problem.
Pascal Van Leeuwen May 30, 2019, 2:29 p.m. UTC | #18
> On Thu, 30 May 2019 at 15:58, Herbert Xu <herbert@gondor.apana.org.au>
> wrote:
> >
> > On Thu, May 30, 2019 at 01:45:47PM +0000, Iuliana Prodan wrote:
> > >
> > > On the current structure of caamalg, to work, iv needs to be copied
> > > before memcpy(iv, req->iv, ivsize), from skcipher_edesc_alloc
> function.
> > > For this we need edesc, but this cannot be allocated before knowing
> how
> > > much memory we need. So, to make it work, we'll need to modify more in
> CAAM.
> >
> > All the copying does is:
> >
> >         if (ivsize)
> >                 scatterwalk_map_and_copy(req->iv, req->src, req-
> >cryptlen -
> >                                          ivsize, ivsize, 0);
> >
> > Why do you need to allocate the edesc before doing this?
> >
> 
> Because that is where the incoming iv is currently consumed. Copying
> it out like this wipes the input IV from memory.

I had a similar problem for the Inside Secure driver: for decrypt operations,
you need to copy the output IV from your input data buffer (it's the last 
input data cipher block) but you need to do that *before* you start the
hardware, as for in-place operations, it is overwritten.

At the same time, you cannot store it to req->iv yet, because that still
contains the input IV that you need for processing the first block.

The only solution I could come up with, is to park it in some temporary
buffer in the skcipher_request_ctx *before* starting the hardware and copy 
it to req->iv *after* the operation completes.

Regards,
Pascal van Leeuwen
Silicon IP Architect, Multi-Protocol Engines @ Inside Secure
www.insidesecure.com
Ard Biesheuvel May 30, 2019, 2:31 p.m. UTC | #19
On Thu, 30 May 2019 at 16:28, Ard Biesheuvel <ard.biesheuvel@linaro.org> wrote:
>
> On Thu, 30 May 2019 at 16:27, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> >
> > On Thu, May 30, 2019 at 03:55:07PM +0200, Ard Biesheuvel wrote:
> > >
> > > > Would this work?
> >
> > I see.  You need to preserve the original IV.
> >
> > > > diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> > > > index c0ece44f303b..2ef2f76a3cb8 100644
> > > > --- a/drivers/crypto/caam/caamalg.c
> > > > +++ b/drivers/crypto/caam/caamalg.c
> > > > @@ -1832,22 +1832,25 @@ static int skcipher_decrypt(struct
> > > > skcipher_request *req)
> > > >         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
> > > >         int ivsize = crypto_skcipher_ivsize(skcipher);
> > > >         struct device *jrdev = ctx->jrdev;
> > > > +       u8 out_iv[AES_BLOCK_SIZE];
> > > >         u32 *desc;
> > > >         int ret = 0;
> > > >
> > > > -       /* allocate extended descriptor */
> > > > -       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> > > > -       if (IS_ERR(edesc))
> > > > -               return PTR_ERR(edesc);
> > > > -
> > > >         /*
> > > >          * The crypto API expects us to set the IV (req->iv) to the last
> > > >          * ciphertext block.
> > > >          */
> > > >         if (ivsize)
> > > > -               scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
> > > > +               scatterwalk_map_and_copy(out_iv, req->src, req->cryptlen -
> > > >                                          ivsize, ivsize, 0);
> > > >
> > > > +       /* allocate extended descriptor */
> > > > +       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> > > > +       if (IS_ERR(edesc))
> > > > +               return PTR_ERR(edesc);
> > > > +
> > > > +       memcpy(req->iv, out_iv, ivsize);
> > > > +
> > > >         /* Create and submit job descriptor*/
> > > >         init_skcipher_job(req, edesc, false);
> > > >         desc = edesc->hw_desc;
> > >
> > > Umm never mind
> > >
> > > /me hides in shame
> >
> > So why doesn't this work?
> >
>
> Because the memcpy() occurs while the buffer is mapped for DMA, so it
> suffers from the exact same problem.

This might work:

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index c0ece44f303b..3d313d2a279a 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -1661,7 +1661,8 @@ static int aead_decrypt(struct aead_request *req)
  * allocate and map the skcipher extended descriptor for skcipher
  */
 static struct skcipher_edesc *skcipher_edesc_alloc(struct
skcipher_request *req,
-                                                  int desc_bytes)
+                                                  int desc_bytes,
+                                                  u8 const *input_iv)
 {
        struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
        struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
@@ -1745,7 +1746,7 @@ static struct skcipher_edesc
*skcipher_edesc_alloc(struct skcipher_request *req,
        /* Make sure IV is located in a DMAable area */
        if (ivsize) {
                iv = (u8 *)edesc->hw_desc + desc_bytes + sec4_sg_bytes;
-               memcpy(iv, req->iv, ivsize);
+               memcpy(iv, input_iv, ivsize);

                iv_dma = dma_map_single(jrdev, iv, ivsize, DMA_TO_DEVICE);
                if (dma_mapping_error(jrdev, iv_dma)) {
@@ -1801,7 +1802,8 @@ static int skcipher_encrypt(struct skcipher_request *req)
        int ret = 0;

        /* allocate extended descriptor */
-       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
+       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ,
+                                    req->iv);
        if (IS_ERR(edesc))
                return PTR_ERR(edesc);

@@ -1832,13 +1834,11 @@ static int skcipher_decrypt(struct
skcipher_request *req)
        struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
        int ivsize = crypto_skcipher_ivsize(skcipher);
        struct device *jrdev = ctx->jrdev;
+       u8 in_iv[AES_BLOCK_SIZE];
        u32 *desc;
        int ret = 0;

-       /* allocate extended descriptor */
-       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
-       if (IS_ERR(edesc))
-               return PTR_ERR(edesc);
+       memcpy(in_iv, req->iv, ivsize);

        /*
         * The crypto API expects us to set the IV (req->iv) to the last
@@ -1848,6 +1848,11 @@ static int skcipher_decrypt(struct skcipher_request *req)
                scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
                                         ivsize, ivsize, 0);

+       /* allocate extended descriptor */
+       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ, in_iv);
+       if (IS_ERR(edesc))
+               return PTR_ERR(edesc);
+
        /* Create and submit job descriptor*/
        init_skcipher_job(req, edesc, false);
        desc = edesc->hw_desc;
Herbert Xu May 30, 2019, 2:34 p.m. UTC | #20
On Thu, May 30, 2019 at 04:31:09PM +0200, Ard Biesheuvel wrote:
>
> This might work:

Looks good to me.

Thanks,
Pascal Van Leeuwen May 30, 2019, 2:53 p.m. UTC | #21
> 
> This might work:
> 
> diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> index c0ece44f303b..3d313d2a279a 100644
> --- a/drivers/crypto/caam/caamalg.c
> +++ b/drivers/crypto/caam/caamalg.c
> @@ -1661,7 +1661,8 @@ static int aead_decrypt(struct aead_request *req)
>   * allocate and map the skcipher extended descriptor for skcipher
>   */
>  static struct skcipher_edesc *skcipher_edesc_alloc(struct
> skcipher_request *req,
> -                                                  int desc_bytes)
> +                                                  int desc_bytes,
> +                                                  u8 const *input_iv)
>  {
>         struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
>         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
> @@ -1745,7 +1746,7 @@ static struct skcipher_edesc
> *skcipher_edesc_alloc(struct skcipher_request *req,
>         /* Make sure IV is located in a DMAable area */
>         if (ivsize) {
>                 iv = (u8 *)edesc->hw_desc + desc_bytes + sec4_sg_bytes;
> -               memcpy(iv, req->iv, ivsize);
> +               memcpy(iv, input_iv, ivsize);
> 
>                 iv_dma = dma_map_single(jrdev, iv, ivsize, DMA_TO_DEVICE);
>                 if (dma_mapping_error(jrdev, iv_dma)) {
> @@ -1801,7 +1802,8 @@ static int skcipher_encrypt(struct skcipher_request
> *req)
>         int ret = 0;
> 
>         /* allocate extended descriptor */
> -       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> +       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ,
> +                                    req->iv);
>         if (IS_ERR(edesc))
>                 return PTR_ERR(edesc);
> 
> @@ -1832,13 +1834,11 @@ static int skcipher_decrypt(struct
> skcipher_request *req)
>         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
>         int ivsize = crypto_skcipher_ivsize(skcipher);
>         struct device *jrdev = ctx->jrdev;
> +       u8 in_iv[AES_BLOCK_SIZE];
>         u32 *desc;
>         int ret = 0;
> 
> -       /* allocate extended descriptor */
> -       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> -       if (IS_ERR(edesc))
> -               return PTR_ERR(edesc);
> +       memcpy(in_iv, req->iv, ivsize);
> 
>         /*
>          * The crypto API expects us to set the IV (req->iv) to the last
> @@ -1848,6 +1848,11 @@ static int skcipher_decrypt(struct skcipher_request
> *req)
>                 scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen
> -
>                                          ivsize, ivsize, 0);
> 
> +       /* allocate extended descriptor */
> +       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ,
> in_iv);
> +       if (IS_ERR(edesc))
> +               return PTR_ERR(edesc);
> +
>         /* Create and submit job descriptor*/
>         init_skcipher_job(req, edesc, false);
>         desc = edesc->hw_desc;

Interesting. This discussion made me reevaluate my own implementation.

First thing I realised is that I *am* currently doing the IV copying with
the data buffer mapped for DMA ... If I understand correctly that may be a
bad idea on some systems? In which case I will need to do my copy earlier.

Second thing is the above implementation made me realise I don't need to
buffer the IV as part of the skcipher_request_ctx, I can just copy the
original req->iv to some local variable, pass a pointer to that along and
then overwrite req->iv directly. More elegant and hopefully also more
efficient ...

Regards,
Pascal van Leeuwen
Silicon IP Architect, Multi-Protocol Engines
www.insidesecure.com
Ard Biesheuvel May 30, 2019, 3:04 p.m. UTC | #22
On Thu, 30 May 2019 at 16:34, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, May 30, 2019 at 04:31:09PM +0200, Ard Biesheuvel wrote:
> >
> > This might work:
>
> Looks good to me.
>

Thanks Herbert,

But given your remark regarding CBC being the only algo that has this
requirement, I wonder if this might be sufficient as well.

diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index c0ece44f303b..65b050e3742f 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -1844,7 +1844,7 @@ static int skcipher_decrypt(struct skcipher_request *req)
         * The crypto API expects us to set the IV (req->iv) to the last
         * ciphertext block.
         */
-       if (ivsize)
+       if (ctx->cdata.algtype & OP_ALG_AAI_CBC)
                scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
                                         ivsize, ivsize, 0);


Iulia, Horia?
Herbert Xu May 30, 2019, 3:06 p.m. UTC | #23
On Thu, May 30, 2019 at 05:04:51PM +0200, Ard Biesheuvel wrote:
>
> But given your remark regarding CBC being the only algo that has this
> requirement, I wonder if this might be sufficient as well.

It's not that CBC is the only one with the requirement.  It's just
that this is the wrong output IV for CTR.

Cheers,
Ard Biesheuvel May 30, 2019, 3:10 p.m. UTC | #24
On Thu, 30 May 2019 at 17:06, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, May 30, 2019 at 05:04:51PM +0200, Ard Biesheuvel wrote:
> >
> > But given your remark regarding CBC being the only algo that has this
> > requirement, I wonder if this might be sufficient as well.
>
> It's not that CBC is the only one with the requirement.  It's just
> that this is the wrong output IV for CTR.
>

Are there any generic templates relying on this for other algos than CBC?
Ard Biesheuvel May 30, 2019, 3:13 p.m. UTC | #25
On Thu, 30 May 2019 at 16:53, Pascal Van Leeuwen
<pvanleeuwen@insidesecure.com> wrote:
>
> >
> > This might work:
> >
> > diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> > index c0ece44f303b..3d313d2a279a 100644
> > --- a/drivers/crypto/caam/caamalg.c
> > +++ b/drivers/crypto/caam/caamalg.c
> > @@ -1661,7 +1661,8 @@ static int aead_decrypt(struct aead_request *req)
> >   * allocate and map the skcipher extended descriptor for skcipher
> >   */
> >  static struct skcipher_edesc *skcipher_edesc_alloc(struct
> > skcipher_request *req,
> > -                                                  int desc_bytes)
> > +                                                  int desc_bytes,
> > +                                                  u8 const *input_iv)
> >  {
> >         struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
> >         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
> > @@ -1745,7 +1746,7 @@ static struct skcipher_edesc
> > *skcipher_edesc_alloc(struct skcipher_request *req,
> >         /* Make sure IV is located in a DMAable area */
> >         if (ivsize) {
> >                 iv = (u8 *)edesc->hw_desc + desc_bytes + sec4_sg_bytes;
> > -               memcpy(iv, req->iv, ivsize);
> > +               memcpy(iv, input_iv, ivsize);
> >
> >                 iv_dma = dma_map_single(jrdev, iv, ivsize, DMA_TO_DEVICE);
> >                 if (dma_mapping_error(jrdev, iv_dma)) {
> > @@ -1801,7 +1802,8 @@ static int skcipher_encrypt(struct skcipher_request
> > *req)
> >         int ret = 0;
> >
> >         /* allocate extended descriptor */
> > -       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> > +       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ,
> > +                                    req->iv);
> >         if (IS_ERR(edesc))
> >                 return PTR_ERR(edesc);
> >
> > @@ -1832,13 +1834,11 @@ static int skcipher_decrypt(struct
> > skcipher_request *req)
> >         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
> >         int ivsize = crypto_skcipher_ivsize(skcipher);
> >         struct device *jrdev = ctx->jrdev;
> > +       u8 in_iv[AES_BLOCK_SIZE];
> >         u32 *desc;
> >         int ret = 0;
> >
> > -       /* allocate extended descriptor */
> > -       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ);
> > -       if (IS_ERR(edesc))
> > -               return PTR_ERR(edesc);
> > +       memcpy(in_iv, req->iv, ivsize);
> >
> >         /*
> >          * The crypto API expects us to set the IV (req->iv) to the last
> > @@ -1848,6 +1848,11 @@ static int skcipher_decrypt(struct skcipher_request
> > *req)
> >                 scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen
> > -
> >                                          ivsize, ivsize, 0);
> >
> > +       /* allocate extended descriptor */
> > +       edesc = skcipher_edesc_alloc(req, DESC_JOB_IO_LEN * CAAM_CMD_SZ,
> > in_iv);
> > +       if (IS_ERR(edesc))
> > +               return PTR_ERR(edesc);
> > +
> >         /* Create and submit job descriptor*/
> >         init_skcipher_job(req, edesc, false);
> >         desc = edesc->hw_desc;
>
> Interesting. This discussion made me reevaluate my own implementation.
>
> First thing I realised is that I *am* currently doing the IV copying with
> the data buffer mapped for DMA ... If I understand correctly that may be a
> bad idea on some systems? In which case I will need to do my copy earlier.
>

Yes, this may break on non-coherent systems, since the DMA layer does
not expect cachelines covering the mapped region to enter the dirty
state while the mapping is active. DMA unmap does a clean+invalidate
to make the data written to main memory by the device visible to the
CPU, and it does not expect the 'clean' part of that operation to
result in any writebacks. If such a writeback does occur, it does so
at cacheline granularity, therefore potentially overwriting some data
that was just written by the device.

> Second thing is the above implementation made me realise I don't need to
> buffer the IV as part of the skcipher_request_ctx, I can just copy the
> original req->iv to some local variable, pass a pointer to that along and
> then overwrite req->iv directly. More elegant and hopefully also more
> efficient ...
>
Herbert Xu May 30, 2019, 3:13 p.m. UTC | #26
On Thu, May 30, 2019 at 05:10:06PM +0200, Ard Biesheuvel wrote:
>
> Are there any generic templates relying on this for other algos than CBC?

algif_skcipher relies on this.

Cheers,
Ard Biesheuvel May 30, 2019, 3:17 p.m. UTC | #27
On Thu, 30 May 2019 at 17:13, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, May 30, 2019 at 05:10:06PM +0200, Ard Biesheuvel wrote:
> >
> > Are there any generic templates relying on this for other algos than CBC?
>
> algif_skcipher relies on this.
>

I see.

In any case, that one line patch would still make things substantially
better, given that the output IV is already wrong for all algorithms
except CBC anyway, but with the patch applied, at least it no longer
corrupts the decrypted plaintext when using GCM or CCM.
Iuliana Prodan May 30, 2019, 10 p.m. UTC | #28
On 5/30/2019 6:05 PM, Ard Biesheuvel wrote:
> On Thu, 30 May 2019 at 16:34, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>>
>> On Thu, May 30, 2019 at 04:31:09PM +0200, Ard Biesheuvel wrote:
>>>
>>> This might work:
>>
>> Looks good to me.
>>
> 
> Thanks Herbert,
> 
> But given your remark regarding CBC being the only algo that has this
> requirement, I wonder if this might be sufficient as well.
> 
> diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> index c0ece44f303b..65b050e3742f 100644
> --- a/drivers/crypto/caam/caamalg.c
> +++ b/drivers/crypto/caam/caamalg.c
> @@ -1844,7 +1844,7 @@ static int skcipher_decrypt(struct skcipher_request *req)
>           * The crypto API expects us to set the IV (req->iv) to the last
>           * ciphertext block.
>           */
> -       if (ivsize)
> +       if (ctx->cdata.algtype & OP_ALG_AAI_CBC)
>                  scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
>                                           ivsize, ivsize, 0);
> 
> 
> Iulia, Horia?
> 
I can confirm that gcm (and ccm), with ctr-aes-caam, is passing with the 
above fix.

Thanks,
Iulia
Horia Geanta May 31, 2019, 5:22 a.m. UTC | #29
On 5/30/2019 4:26 PM, Herbert Xu wrote:
> On Thu, May 30, 2019 at 01:18:34PM +0000, Horia Geanta wrote:
>>
>> I guess there are only two options:
>> -either cache line sharing is avoided OR
>> -users need to be *aware* they are sharing the cache line and some rules /
>> assumptions are in place on how to safely work on the data
> 
> No there is a third option and it's very simple:
> 
I see this as the 2nd option.

> You must only access the virtual addresses given to you before DMA
> mapping or after DMA unmapping.
> 
Unless it's clearly defined *which* virtual addresses mustn't be accessed,
things won't work properly.
In theory, any two objects could share a cache line. We can't just stop all
memory accesses from CPU while a peripheral is busy.

Thanks,
Horia
Herbert Xu May 31, 2019, 5:42 a.m. UTC | #30
On Fri, May 31, 2019 at 05:22:50AM +0000, Horia Geanta wrote:
>
> Unless it's clearly defined *which* virtual addresses mustn't be accessed,
> things won't work properly.
> In theory, any two objects could share a cache line. We can't just stop all
> memory accesses from CPU while a peripheral is busy.

The user obviously can't touch the memory areas potentially under
DMA.  But in this case it's not the user that's doing it, it's
the driver.

So the driver must not touch any virtual pointers given to it
as input/output while the DMA areas are mapped.

Cheers,
Horia Geanta May 31, 2019, 5:54 a.m. UTC | #31
On 5/31/2019 1:00 AM, Iuliana Prodan wrote:
> On 5/30/2019 6:05 PM, Ard Biesheuvel wrote:
>> On Thu, 30 May 2019 at 16:34, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>>>
>>> On Thu, May 30, 2019 at 04:31:09PM +0200, Ard Biesheuvel wrote:
>>>>
>>>> This might work:
>>>
>>> Looks good to me.
>>>
>>
>> Thanks Herbert,
>>
>> But given your remark regarding CBC being the only algo that has this
>> requirement, I wonder if this might be sufficient as well.
>>
>> diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
>> index c0ece44f303b..65b050e3742f 100644
>> --- a/drivers/crypto/caam/caamalg.c
>> +++ b/drivers/crypto/caam/caamalg.c
>> @@ -1844,7 +1844,7 @@ static int skcipher_decrypt(struct skcipher_request *req)
>>           * The crypto API expects us to set the IV (req->iv) to the last
>>           * ciphertext block.
>>           */
>> -       if (ivsize)
>> +       if (ctx->cdata.algtype & OP_ALG_AAI_CBC)
>>                  scatterwalk_map_and_copy(req->iv, req->src, req->cryptlen -
>>                                           ivsize, ivsize, 0);
>>
>>
>> Iulia, Horia?
>>
> I can confirm that gcm (and ccm), with ctr-aes-caam, is passing with the 
> above fix.
> 
The check should be:
	if ((ctx->cdata.algtype & OP_ALG_AAI_MASK) == OP_ALG_AAI_CBC)

Having this workaround is probably ok, until we properly fix the IV update for
CTR mode.

Thanks,
Horia
Horia Geanta May 31, 2019, 6:05 a.m. UTC | #32
On 5/30/2019 4:26 PM, Ard Biesheuvel wrote:
> On Thu, 30 May 2019 at 15:18, Horia Geanta <horia.geanta@nxp.com> wrote:
>>
>> On 5/30/2019 11:08 AM, Ard Biesheuvel wrote:
>>> On Thu, 30 May 2019 at 09:46, Horia Geanta <horia.geanta@nxp.com> wrote:
>>>>
>>>> On 5/30/2019 8:34 AM, Herbert Xu wrote:
>>>>> It would appear that Ard's latest suggestion should fix the problem
>>>>> and is the correct approach.
>>>>>
>>>> I disagree.
>>>> We shouldn't force crypto implementations to be aware of such inconsistencies in
>>>> the I/O data buffers (pointed to by src/dst scatterlists) that are supposed to
>>>> be safely DMA mapped.
>>>>
>>>
>>> I'm on the fence here. On the one hand, it is slightly dodgy for the
>>> GCM driver to pass a scatterlist referencing a buffer that shares a
>>> cacheline with another buffer passed by an ordinary pointer, and for
>>> which an explicit requirement exists that the callee should update it
>>> before returning.
>>>
>>> On the other hand, I think it is reasonable to require drivers not to
>>> perform such updates while the scatterlist is mapped for DMA, since
>>> fixing it in the callers puts a disproportionate burden on them, given
>>> that non-coherent DMA only represents a small minority of use cases.
>>>
>> The problem with this approach is that the buffers in the scatterlist could
>> hypothetically share cache lines with *any* other CPU-updated data, not just the
>> IV in the crypto request (as it happens here).
>> How could a non-coherent DMA implementation cope with this?
>>
> 
> I don't think the situation is that bad. We only support allocations
> in the linear map for DMA/scatterlists, and buffers in the linear map
> can only share a cacheline if they were allocated using the same
> kmalloc() invocation (on systems that support non-coherent DMA)
> 
> That does mean that it is actually more straightforward to deal with
> this at the level where the allocation occurs, and that would justify
> dealing with it in the callers.
> 
> However, from the callee's point of view, it simply means that you
> should not dereference any request struct pointers for writing while
> the 'dst' scatterlist is mapped for DMA.
> 
Is this the only restriction, or might I find out more in the future?

This kind of violation of an abstraction layer must be clearly documented.

In particular, if crypto API implementations cannot rely on DMA API guarantees,
then exceptions must be listed.

Thanks,
Horia
Horia Geanta May 31, 2019, 6:10 a.m. UTC | #33
On 5/31/2019 8:43 AM, Herbert Xu wrote:
> On Fri, May 31, 2019 at 05:22:50AM +0000, Horia Geanta wrote:
>>
>> Unless it's clearly defined *which* virtual addresses mustn't be accessed,
>> things won't work properly.
>> In theory, any two objects could share a cache line. We can't just stop all
>> memory accesses from CPU while a peripheral is busy.
> 
> The user obviously can't touch the memory areas potentially under
> DMA.  But in this case it's not the user that's doing it, it's
> the driver.
> 
> So the driver must not touch any virtual pointers given to it
> as input/output while the DMA areas are mapped.
> 
Driver is not touching the DMA mapped areas, the DMA API conventions are
carefully followed.
It's touching a virtual pointer that is not DMA mapped, that just happens to be
on the same cache line as a DMA-mapped buffer.

Thanks,
Horia
Herbert Xu May 31, 2019, 6:14 a.m. UTC | #34
On Fri, May 31, 2019 at 06:10:51AM +0000, Horia Geanta wrote:
>
> Driver is not touching the DMA mapped areas, the DMA API conventions are
> carefully followed.
> It's touching a virtual pointer that is not DMA mapped, that just happens to be
> on the same cache line with a DMA mapped buffer.

Well you can't control what the users give you so you must assume
that the virtual address always shares a cacheline with the DMA
buffer.

That's why you must only operate on that virtual address either
before you DMA map or after you DMA unmap.  Virtual addresses
that you allocate yourself (including ones on the stack) are
obviously not subject to this restriction.

Cheers,
Herbert Xu June 6, 2019, 6:37 a.m. UTC | #35
On Thu, May 30, 2019 at 04:31:09PM +0200, Ard Biesheuvel wrote:
>
> This might work:
>
> diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> index c0ece44f303b..3d313d2a279a 100644
> --- a/drivers/crypto/caam/caamalg.c
> +++ b/drivers/crypto/caam/caamalg.c
> @@ -1661,7 +1661,8 @@ static int aead_decrypt(struct aead_request *req)
>   * allocate and map the skcipher extended descriptor for skcipher
>   */
>  static struct skcipher_edesc *skcipher_edesc_alloc(struct
> skcipher_request *req,
> -                                                  int desc_bytes)
> +                                                  int desc_bytes,
> +                                                  u8 const *input_iv)
>  {
>         struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
>         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
> @@ -1745,7 +1746,7 @@ static struct skcipher_edesc
> *skcipher_edesc_alloc(struct skcipher_request *req,
>         /* Make sure IV is located in a DMAable area */
>         if (ivsize) {
>                 iv = (u8 *)edesc->hw_desc + desc_bytes + sec4_sg_bytes;
> -               memcpy(iv, req->iv, ivsize);
> +               memcpy(iv, input_iv, ivsize);
> 
>                 iv_dma = dma_map_single(jrdev, iv, ivsize, DMA_TO_DEVICE);
>                 if (dma_mapping_error(jrdev, iv_dma)) {

Hi Ard:

I presume you will be submitting this patch at some point?  When
you do please base it on top of your other one which I'm about to
merge.

Thanks,
Ard Biesheuvel June 6, 2019, 6:42 a.m. UTC | #36
On Thu, 6 Jun 2019 at 08:37, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, May 30, 2019 at 04:31:09PM +0200, Ard Biesheuvel wrote:
> >
> > This might work:
> >
> > diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> > index c0ece44f303b..3d313d2a279a 100644
> > --- a/drivers/crypto/caam/caamalg.c
> > +++ b/drivers/crypto/caam/caamalg.c
> > @@ -1661,7 +1661,8 @@ static int aead_decrypt(struct aead_request *req)
> >   * allocate and map the skcipher extended descriptor for skcipher
> >   */
> >  static struct skcipher_edesc *skcipher_edesc_alloc(struct
> > skcipher_request *req,
> > -                                                  int desc_bytes)
> > +                                                  int desc_bytes,
> > +                                                  u8 const *input_iv)
> >  {
> >         struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
> >         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
> > @@ -1745,7 +1746,7 @@ static struct skcipher_edesc
> > *skcipher_edesc_alloc(struct skcipher_request *req,
> >         /* Make sure IV is located in a DMAable area */
> >         if (ivsize) {
> >                 iv = (u8 *)edesc->hw_desc + desc_bytes + sec4_sg_bytes;
> > -               memcpy(iv, req->iv, ivsize);
> > +               memcpy(iv, input_iv, ivsize);
> >
> >                 iv_dma = dma_map_single(jrdev, iv, ivsize, DMA_TO_DEVICE);
> >                 if (dma_mapping_error(jrdev, iv_dma)) {
>
> Hi Ard:
>
> I presume you will be submitting this patch at some point?  When
> you do please base it on top of your other one which I'm about to
> merge.
>

I'm not sure I follow. Do you want a better fix for the CBC output IV
going forward? Or is this about other modes?
Herbert Xu June 6, 2019, 6:46 a.m. UTC | #37
On Thu, Jun 06, 2019 at 08:42:46AM +0200, Ard Biesheuvel wrote:
> On Thu, 6 Jun 2019 at 08:37, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> >
> > On Thu, May 30, 2019 at 04:31:09PM +0200, Ard Biesheuvel wrote:
> > >
> > > This might work:
> > >
> > > diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> > > index c0ece44f303b..3d313d2a279a 100644
> > > --- a/drivers/crypto/caam/caamalg.c
> > > +++ b/drivers/crypto/caam/caamalg.c
> > > @@ -1661,7 +1661,8 @@ static int aead_decrypt(struct aead_request *req)
> > >   * allocate and map the skcipher extended descriptor for skcipher
> > >   */
> > >  static struct skcipher_edesc *skcipher_edesc_alloc(struct
> > > skcipher_request *req,
> > > -                                                  int desc_bytes)
> > > +                                                  int desc_bytes,
> > > +                                                  u8 const *input_iv)
> > >  {
> > >         struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
> > >         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
> > > @@ -1745,7 +1746,7 @@ static struct skcipher_edesc
> > > *skcipher_edesc_alloc(struct skcipher_request *req,
> > >         /* Make sure IV is located in a DMAable area */
> > >         if (ivsize) {
> > >                 iv = (u8 *)edesc->hw_desc + desc_bytes + sec4_sg_bytes;
> > > -               memcpy(iv, req->iv, ivsize);
> > > +               memcpy(iv, input_iv, ivsize);
> > >
> > >                 iv_dma = dma_map_single(jrdev, iv, ivsize, DMA_TO_DEVICE);
> > >                 if (dma_mapping_error(jrdev, iv_dma)) {
> >
> > Hi Ard:
> >
> > I presume you will be submitting this patch at some point?  When
> > you do please base it on top of your other one which I'm about to
> > merge.
> 
> I'm not sure I follow. Do you want a better fix for the CBC output IV
> going forward? Or is this about other modes?

You sent me a patch to fix CTR mode:

https://patchwork.kernel.org/patch/10969747/

But your suggested fix for CBC mode itself where we need to do the
copy (as seen quoted above) hasn't been submitted.

Cheers,
Ard Biesheuvel June 6, 2019, 6:53 a.m. UTC | #38
On Thu, 6 Jun 2019 at 08:46, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>
> On Thu, Jun 06, 2019 at 08:42:46AM +0200, Ard Biesheuvel wrote:
> > On Thu, 6 Jun 2019 at 08:37, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> > >
> > > On Thu, May 30, 2019 at 04:31:09PM +0200, Ard Biesheuvel wrote:
> > > >
> > > > This might work:
> > > >
> > > > diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
> > > > index c0ece44f303b..3d313d2a279a 100644
> > > > --- a/drivers/crypto/caam/caamalg.c
> > > > +++ b/drivers/crypto/caam/caamalg.c
> > > > @@ -1661,7 +1661,8 @@ static int aead_decrypt(struct aead_request *req)
> > > >   * allocate and map the skcipher extended descriptor for skcipher
> > > >   */
> > > >  static struct skcipher_edesc *skcipher_edesc_alloc(struct
> > > > skcipher_request *req,
> > > > -                                                  int desc_bytes)
> > > > +                                                  int desc_bytes,
> > > > +                                                  u8 const *input_iv)
> > > >  {
> > > >         struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req);
> > > >         struct caam_ctx *ctx = crypto_skcipher_ctx(skcipher);
> > > > @@ -1745,7 +1746,7 @@ static struct skcipher_edesc
> > > > *skcipher_edesc_alloc(struct skcipher_request *req,
> > > >         /* Make sure IV is located in a DMAable area */
> > > >         if (ivsize) {
> > > >                 iv = (u8 *)edesc->hw_desc + desc_bytes + sec4_sg_bytes;
> > > > -               memcpy(iv, req->iv, ivsize);
> > > > +               memcpy(iv, input_iv, ivsize);
> > > >
> > > >                 iv_dma = dma_map_single(jrdev, iv, ivsize, DMA_TO_DEVICE);
> > > >                 if (dma_mapping_error(jrdev, iv_dma)) {
> > >
> > > Hi Ard:
> > >
> > > I presume you will be submitting this patch at some point?  When
> > > you do please base it on top of your other one which I'm about to
> > > merge.
> >
> > I'm not sure I follow. Do you want a better fix for the CBC output IV
> > going forward? Or is this about other modes?
>
> You sent me a patch to fix CTR mode:
>
> https://patchwork.kernel.org/patch/10969747/
>

That patch does not fix CTR mode, it just disables copying the output
IV altogether for CTR mode so the DMA corruption does not occur.

> But your suggested fix for CBC mode itself where we need to do the
> copy (as seen quoted above) hasn't been submitted.
>

That same patch 'fixes' CBC, since CBC was never broken to begin with.
The CTS driver does not have something like the auth_tag sharing the
same cacheline with the IV, so CBC has always worked fine.

So I guess what you are after is a patch that, instead of dodging the
issue by limiting the copy to CBC, does not perform the copy at all
while anything is mapped for DMA? Then we can leave it up to the NXP
engineers to fix CTR mode.
Herbert Xu June 6, 2019, 6:57 a.m. UTC | #39
On Thu, Jun 06, 2019 at 08:53:10AM +0200, Ard Biesheuvel wrote:
>
> That same patch 'fixes' CBC, since CBC was never broken to begin with.
> The CTS driver does not have something like the auth_tag sharing the
> same cacheline with the IV, so CBC has always worked fine.

CBC is broken.  Any crypto API user is allowed to place the IV
in the same position relative to the src/dst buffer.  So the driver
must deal with it.

It's just that the CTR/ghash combo happened to expose this first.

> So I guess what you are after is a patch that, instead of dodging the
> issue by limiting the copy to CBC, does not perform the copy at all
> while anything is mapped for DMA? Then we can leave it up to the NXP
> engineers to fix CTR mode.

Right, we definitely need to fix it for CBC, probably in the way that
you suggested.

We should fix CTR too but at least it should be obviously broken as
the self-test should catch this case now.

Cheers,
Horia Geanta June 6, 2019, 7:10 a.m. UTC | #40
On 6/6/2019 9:58 AM, Herbert Xu wrote:
> On Thu, Jun 06, 2019 at 08:53:10AM +0200, Ard Biesheuvel wrote:
>>
>> That same patch 'fixes' CBC, since CBC was never broken to begin with.
>> The CTS driver does not have something like the auth_tag sharing the
>> same cacheline with the IV, so CBC has always worked fine.
> 
> CBC is broken.  Any crypto API user is allowed to place the IV
> in the same position relative to the src/dst buffer.  So the driver
> must deal with it.
> 
That's the theory.
In practice we haven't encountered any issue so far, but yes this case has to be
handled properly.

> It's just that the CTR/ghash combo happened to expose this first.
> 
Yes, and that's what the patch is fixing.

>> So I guess what you are after is a patch that, instead of dodging the
>> issue by limiting the copy to CBC, does not perform the copy at all
>> while anything is mapped for DMA? Then we can leave it up to the NXP
>> engineers to fix CTR mode.
> 
> Right, we definitely need to fix it for CBC, probably in the way that
> you suggested.
> 
Not really.
I am in favor of using the HW to update the IV, which would work for all
skcipher algorithms.
I have the fix ready, will send it in a couple of days.

Thanks,
Horia
Herbert Xu June 6, 2019, 7:15 a.m. UTC | #41
On Thu, Jun 06, 2019 at 07:10:06AM +0000, Horia Geanta wrote:
>
> Not really.
> I am in favor of using the HW to update the IV, which would work for all
> skcipher algorithms.
> I have the fix ready, will send it in a couple of days.

OK that would be interesting to see.  But I presume you are still
going to do a copy after the DMA unmap since you can't do DMA to
req->iv?

Cheers,
Horia Geanta June 6, 2019, 8:36 a.m. UTC | #42
On 6/6/2019 10:16 AM, Herbert Xu wrote:
> On Thu, Jun 06, 2019 at 07:10:06AM +0000, Horia Geanta wrote:
>>
>> Not really.
>> I am in favor of using the HW to update the IV, which would work for all
>> skcipher algorithms.
>> I have the fix ready, will send it in a couple of days.
> 
> OK that would be interesting to see.  But I presume you are still
> going to do a copy after the DMA unmap since you can't do DMA to
> req->iv?
> 
Yes, an internally kmalloc-ed buffer is used for storing the IV (both input and
output IV).
Once HW finishes the job, area is DMA unmapped and then output IV is copied into
req->iv.

Regards,
Horia
Herbert Xu June 6, 2019, 9:33 a.m. UTC | #43
On Thu, Jun 06, 2019 at 08:36:52AM +0000, Horia Geanta wrote:
>
> Yes, an internally kmalloc-ed buffer is used for storing the IV (both input and
> output IV).
> Once HW finishes the job, area is DMA unmapped and then output IV is copied into
> req->iv.

That sounds good to me.  Thanks!

Patch
diff mbox series

diff --git a/crypto/gcm.c b/crypto/gcm.c
index 33f45a9..53e3ce5 100644
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -66,7 +66,7 @@  struct crypto_gcm_ghash_ctx {
 
 struct crypto_gcm_req_priv_ctx {
 	u8 iv[16];
-	u8 auth_tag[16];
+	u8 *auth_tag;
 	u8 iauth_tag[16];
 	struct scatterlist src[3];
 	struct scatterlist dst[3];
@@ -177,19 +177,23 @@  static void crypto_gcm_init_common(struct aead_request *req)
 	__be32 counter = cpu_to_be32(1);
 	struct scatterlist *sg;
 
-	memset(pctx->auth_tag, 0, sizeof(pctx->auth_tag));
+	/*
+	 * kzalloc alignment is at least the cache line size
+	 * for non-cache coherent architectures.
+	 */
+	pctx->auth_tag = kzalloc(GCM_MAX_AUTH_SIZE, GFP_KERNEL);
 	memcpy(pctx->iv, req->iv, GCM_AES_IV_SIZE);
 	memcpy(pctx->iv + GCM_AES_IV_SIZE, &counter, 4);
 
 	sg_init_table(pctx->src, 3);
-	sg_set_buf(pctx->src, pctx->auth_tag, sizeof(pctx->auth_tag));
+	sg_set_buf(pctx->src, pctx->auth_tag, GCM_MAX_AUTH_SIZE);
 	sg = scatterwalk_ffwd(pctx->src + 1, req->src, req->assoclen);
 	if (sg != pctx->src + 1)
 		sg_chain(pctx->src, 2, sg);
 
 	if (req->src != req->dst) {
 		sg_init_table(pctx->dst, 3);
-		sg_set_buf(pctx->dst, pctx->auth_tag, sizeof(pctx->auth_tag));
+		sg_set_buf(pctx->dst, pctx->auth_tag, GCM_MAX_AUTH_SIZE);
 		sg = scatterwalk_ffwd(pctx->dst + 1, req->dst, req->assoclen);
 		if (sg != pctx->dst + 1)
 			sg_chain(pctx->dst, 2, sg);
@@ -208,9 +212,8 @@  static void crypto_gcm_init_crypt(struct aead_request *req,
 	dst = req->src == req->dst ? pctx->src : pctx->dst;
 
 	skcipher_request_set_tfm(skreq, ctx->ctr);
-	skcipher_request_set_crypt(skreq, pctx->src, dst,
-				     cryptlen + sizeof(pctx->auth_tag),
-				     pctx->iv);
+	skcipher_request_set_crypt(skreq, pctx->src, dst, cryptlen +
+				   GCM_MAX_AUTH_SIZE, pctx->iv);
 }
 
 static inline unsigned int gcm_remain(unsigned int len)
@@ -440,6 +443,7 @@  static int gcm_enc_copy_hash(struct aead_request *req, u32 flags)
 	scatterwalk_map_and_copy(auth_tag, req->dst,
 				 req->assoclen + req->cryptlen,
 				 crypto_aead_authsize(aead), 1);
+	kfree(auth_tag);
 	return 0;
 }
 
@@ -492,11 +496,15 @@  static int crypto_gcm_verify(struct aead_request *req)
 	u8 *iauth_tag = pctx->iauth_tag;
 	unsigned int authsize = crypto_aead_authsize(aead);
 	unsigned int cryptlen = req->cryptlen - authsize;
+	int err;
 
 	crypto_xor(auth_tag, iauth_tag, 16);
 	scatterwalk_map_and_copy(iauth_tag, req->src,
 				 req->assoclen + cryptlen, authsize, 0);
-	return crypto_memneq(iauth_tag, auth_tag, authsize) ? -EBADMSG : 0;
+	err = crypto_memneq(iauth_tag, auth_tag, authsize) ? -EBADMSG : 0;
+	kfree(auth_tag);
+
+	return err;
 }
 
 static void gcm_decrypt_done(struct crypto_async_request *areq, int err)
@@ -678,7 +686,7 @@  static int crypto_gcm_create_common(struct crypto_template *tmpl,
 	inst->alg.base.cra_ctxsize = sizeof(struct crypto_gcm_ctx);
 	inst->alg.ivsize = GCM_AES_IV_SIZE;
 	inst->alg.chunksize = crypto_skcipher_alg_chunksize(ctr);
-	inst->alg.maxauthsize = 16;
+	inst->alg.maxauthsize = GCM_MAX_AUTH_SIZE;
 	inst->alg.init = crypto_gcm_init_tfm;
 	inst->alg.exit = crypto_gcm_exit_tfm;
 	inst->alg.setkey = crypto_gcm_setkey;
diff --git a/include/crypto/gcm.h b/include/crypto/gcm.h
index c50e057..6b634ff 100644
--- a/include/crypto/gcm.h
+++ b/include/crypto/gcm.h
@@ -1,6 +1,7 @@ 
 #ifndef _CRYPTO_GCM_H
 #define _CRYPTO_GCM_H
 
+#define GCM_MAX_AUTH_SIZE 16
 #define GCM_AES_IV_SIZE 12
 #define GCM_RFC4106_IV_SIZE 8
 #define GCM_RFC4543_IV_SIZE 8