All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
@ 2009-01-29  1:10 Siarhei Siamashka
  2009-01-29  1:20 ` Siarhei Siamashka
  2009-01-29  7:27 ` Marcel Holtmann
  0 siblings, 2 replies; 17+ messages in thread
From: Siarhei Siamashka @ 2009-01-29  1:10 UTC (permalink / raw)
  To: linux-bluetooth

[-- Attachment #1: Type: text/plain, Size: 2099 bytes --]

Hello all,

The attached patch contains optimization for scale factors calculation which
provides additional SBC encoder speedup.

For non-gcc compilers, CLZ function is implemented with a very simple and
slow straightforward code (but it is still faster than current git code even
if used instead of __builtin_clz). Something better could be done like: 
http://groups.google.com/group/comp.sys.arm/msg/5ae56e3a95a2345e?hl=en
But I'm not sure about license/copyright of the code at this link and decided
not to touch it. Anyway, I don't think that gcc implementation of
__builtin_clz for the CPU cores which do not support CLZ instruction is any
worse.

Joint stereo processing also involves recalculation of scale factors, which
can use a similar optimization or even exactly the same function.
I intentionally did not benchmark encoding with joint stereo yet as it would
spoil the nice numbers :) That's something to improve next.

Benchmark results (sbcenc with default settings):

====

ARM Cortex-A8:

before:
real    1m 4.84s
user    1m 1.05s
sys     0m 3.78s

after:
real    0m 58.93s
user    0m 55.15s
sys     0m 3.78s

Intel Core2:

before:
real    0m7.729s
user    0m7.268s
sys     0m0.376s

after:
real    0m6.473s
user    0m6.116s
sys     0m0.292s

====

Overall, CPU usage in SBC encoder looks more or less like this (oprofile log
from ARM Cortex-A8):

samples  %        image name               symbol name
2173     30.6791  sbcenc.neon_new          sbc_encode
1774     25.0459  sbcenc.neon_new          sbc_analyze_4b_8s_neon
1525     21.5304  sbcenc.neon_new          sbc_calculate_bits
916      12.9324  sbcenc.neon_new          sbc_calc_scalefactors
600       8.4710  sbcenc.neon_new          sbc_enc_process_input_8s_be
75        1.0589  libc-2.5.so              memcpy
13        0.1835  sbcenc.neon_new          main
4         0.0565  libc-2.5.so              write
2         0.0282  sbcenc.neon_new          .plt
1         0.0141  ld-2.5.so                _dl_relocate_object

 
Best regards,
Siarhei Siamashka

[-- Attachment #2: 0001-SBC-encoder-scale-factors-calculation-optimized-with.patch --]
[-- Type: text/x-diff, Size: 4696 bytes --]

>From 90c60f04f1540fe2c7d5ab631dbd111c25b03e17 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Thu, 29 Jan 2009 02:17:36 +0200
Subject: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz

Count leading zeros operation is often implemented using a special
instruction for it on various architectures (at least this is true
for ARM and x86). Using __builtin_clz gcc intrinsic allows to
eliminate innermost loop in scale factors calculation and improve
performance. Also scale factors calculation can be optimized even
more using SIMD instructions.
---
 sbc/sbc.c            |   21 +++++----------------
 sbc/sbc_primitives.c |   41 +++++++++++++++++++++++++++++++++++++++++
 sbc/sbc_primitives.h |    4 ++++
 3 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/sbc/sbc.c b/sbc/sbc.c
index 365ee1f..8a2d782 100644
--- a/sbc/sbc.c
+++ b/sbc/sbc.c
@@ -77,7 +77,7 @@ struct sbc_frame {
 	uint8_t joint;
 
 	/* only the lower 4 bits of every element are to be used */
-	uint8_t scale_factor[2][8];
+	uint32_t scale_factor[2][8];
 
 	/* raw integer subband samples in the frame */
 	int32_t SBC_ALIGNED sb_sample_f[16][2][8];
@@ -745,8 +745,6 @@ static SBC_ALWAYS_INLINE int sbc_pack_frame_internal(
 	uint32_t levels[2][8];	/* levels are derived from that */
 	uint32_t sb_sample_delta[2][8];
 
-	u_int32_t scalefactor[2][8];	/* derived from frame->scale_factor */
-
 	data[0] = SBC_SYNCWORD;
 
 	data[1] = (frame->frequency & 0x03) << 6;
@@ -785,19 +783,6 @@ static SBC_ALWAYS_INLINE int sbc_pack_frame_internal(
 	crc_header[1] = data[2];
 	crc_pos = 16;
 
-	for (ch = 0; ch < frame_channels; ch++) {
-		for (sb = 0; sb < frame_subbands; sb++) {
-			frame->scale_factor[ch][sb] = 0;
-			scalefactor[ch][sb] = 2 << SCALE_OUT_BITS;
-			for (blk = 0; blk < frame->blocks; blk++) {
-				while (scalefactor[ch][sb] < fabs(frame->sb_sample_f[blk][ch][sb])) {
-					frame->scale_factor[ch][sb]++;
-					scalefactor[ch][sb] *= 2;
-				}
-			}
-		}
-	}
-
 	if (frame->mode == JOINT_STEREO) {
 		/* like frame->sb_sample but joint stereo */
 		int32_t sb_sample_j[16][2];
@@ -1115,6 +1100,10 @@ int sbc_encode(sbc_t *sbc, void *input, int input_len, void *output,
 
 	samples = sbc_analyze_audio(&priv->enc_state, &priv->frame);
 
+	priv->enc_state.sbc_calc_scalefactors(
+		priv->frame.sb_sample_f, priv->frame.scale_factor,
+		priv->frame.blocks, priv->frame.channels, priv->frame.subbands);
+
 	framelen = sbc_pack_frame(output, &priv->frame, output_len);
 
 	if (written)
diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index 338feb9..303f3fe 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -401,6 +401,44 @@ static int sbc_enc_process_input_8s_be(int position,
 			position, pcm, X, nsamples, 1, 1);
 }
 
+/* Supplementary function to count the number of leading zeros */
+
+static inline int sbc_clz(uint32_t x)
+{
+#ifdef __GNUC__
+	return __builtin_clz(x);
+#else
+	/* TODO: this should be replaced with something better if good
+	 * performance is wanted when using compilers other than gcc */
+	int cnt = 0;
+	while (x) {
+		cnt++;
+		x >>= 1;
+	}
+	return 32 - cnt;
+#endif
+}
+
+static void sbc_calc_scalefactors(
+	int32_t sb_sample_f[16][2][8],
+	uint32_t scale_factor[2][8],
+	int blocks, int channels, int subbands)
+{
+	int ch, sb, blk;
+	for (ch = 0; ch < channels; ch++) {
+		for (sb = 0; sb < subbands; sb++) {
+			uint32_t x = 1 << SCALE_OUT_BITS;
+			for (blk = 0; blk < blocks; blk++) {
+				int32_t tmp = fabs(sb_sample_f[blk][ch][sb]);
+				if (tmp != 0)
+					x |= tmp - 1;
+			}
+			scale_factor[ch][sb] = (31 - SCALE_OUT_BITS) -
+				sbc_clz(x);
+		}
+	}
+}
+
 /*
  * Detect CPU features and setup function pointers
  */
@@ -416,6 +454,9 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
 	state->sbc_enc_process_input_8s_le = sbc_enc_process_input_8s_le;
 	state->sbc_enc_process_input_8s_be = sbc_enc_process_input_8s_be;
 
+	/* Default implementation for scale factors calculation */
+	state->sbc_calc_scalefactors = sbc_calc_scalefactors;
+
 	/* X86/AMD64 optimizations */
 #ifdef SBC_BUILD_WITH_MMX_SUPPORT
 	sbc_init_primitives_mmx(state);
diff --git a/sbc/sbc_primitives.h b/sbc/sbc_primitives.h
index 5b7c9ac..2708c82 100644
--- a/sbc/sbc_primitives.h
+++ b/sbc/sbc_primitives.h
@@ -58,6 +58,10 @@ struct sbc_encoder_state {
 	int (*sbc_enc_process_input_8s_be)(int position,
 			const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
 			int nsamples, int nchannels);
+	/* Scale factors calculation */
+	void (*sbc_calc_scalefactors)(int32_t sb_sample_f[16][2][8],
+			uint32_t scale_factor[2][8],
+			int blocks, int channels, int subbands);
 };
 
 /*
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29  1:10 [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz Siarhei Siamashka
@ 2009-01-29  1:20 ` Siarhei Siamashka
  2009-01-29  7:28   ` Marcel Holtmann
  2009-01-29  9:51   ` Johan Hedberg
  2009-01-29  7:27 ` Marcel Holtmann
  1 sibling, 2 replies; 17+ messages in thread
From: Siarhei Siamashka @ 2009-01-29  1:20 UTC (permalink / raw)
  To: linux-bluetooth

[-- Attachment #1: Type: text/plain, Size: 364 bytes --]

On Thursday 29 January 2009 03:10:03 ext Siarhei Siamashka wrote:
> The attached patch contains optimization for scale factors calculation
> which provides additional SBC encoder speedup.

And MMX variant of this optimization can be implemented with something
like this patch. It still needs to be tested on X86-64 systems though.

Best regards,
Siarhei Siamashka

[-- Attachment #2: sbc-scalefactors-mmx.diff --]
[-- Type: text/x-diff, Size: 2014 bytes --]

diff --git a/sbc/sbc_primitives_mmx.c b/sbc/sbc_primitives_mmx.c
index 7db4af7..41c0241 100644
--- a/sbc/sbc_primitives_mmx.c
+++ b/sbc/sbc_primitives_mmx.c
@@ -275,6 +275,59 @@ static inline void sbc_analyze_4b_8s_mmx(int16_t *x, int32_t *out,
 	asm volatile ("emms\n");
 }
 
+static void sbc_calc_scalefactors_mmx(
+	int32_t sb_sample_f[16][2][8],
+	uint32_t scale_factor[2][8],
+	int blocks, int channels, int subbands)
+{
+	static const SBC_ALIGNED int32_t consts[2] = {
+		1 << SCALE_OUT_BITS,
+		1 << SCALE_OUT_BITS,
+	};
+	int ch, sb;
+	intptr_t blk;
+	for (ch = 0; ch < channels; ch++) {
+		for (sb = 0; sb < subbands; sb += 2) {
+			blk = (blocks - 1) * (((char *) &sb_sample_f[1][0][0] -
+				(char *) &sb_sample_f[0][0][0]));
+			asm volatile (
+				"movq         (%4), %%mm0\n"
+			"1:\n"
+				"movq     (%1, %0), %%mm1\n"
+				"pxor        %%mm2, %%mm2\n"
+				"pcmpgtd     %%mm2, %%mm1\n"
+				"paddd    (%1, %0), %%mm1\n"
+				"pcmpgtd     %%mm1, %%mm2\n"
+				"pxor        %%mm2, %%mm1\n"
+
+				"por         %%mm1, %%mm0\n"
+
+				"sub            %2, %0\n"
+				"jns            1b\n"
+
+				"movd        %%mm0, %k0\n"
+				"psrlq         $32, %%mm0\n"
+				"bsrl          %k0, %k0\n"
+				"subl           %5, %k0\n"
+				"movl          %k0, (%3)\n"
+
+				"movd        %%mm0, %k0\n"
+				"bsrl          %k0, %k0\n"
+				"subl           %5, %k0\n"
+				"movl          %k0, 4(%3)\n"
+			: "+r" (blk)
+			: "r" (&sb_sample_f[0][ch][sb]),
+				"i" ((char *) &sb_sample_f[1][0][0] -
+					(char *) &sb_sample_f[0][0][0]),
+				"r" (&scale_factor[ch][sb]),
+				"r" (&consts),
+				"i" (SCALE_OUT_BITS)
+			: "memory");
+		}
+	}
+	asm volatile ("emms\n");
+}
+
 static int check_mmx_support()
 {
 #ifdef __amd64__
@@ -313,6 +366,8 @@ void sbc_init_primitives_mmx(struct sbc_encoder_state *state)
 	if (check_mmx_support()) {
 		state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_mmx;
 		state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_mmx;
+
+		state->sbc_calc_scalefactors = sbc_calc_scalefactors_mmx;
 	}
 }
 

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29  1:10 [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz Siarhei Siamashka
  2009-01-29  1:20 ` Siarhei Siamashka
@ 2009-01-29  7:27 ` Marcel Holtmann
  2009-01-29 15:00   ` Christian Hoene
  2009-02-02 11:11   ` Siarhei Siamashka
  1 sibling, 2 replies; 17+ messages in thread
From: Marcel Holtmann @ 2009-01-29  7:27 UTC (permalink / raw)
  To: Siarhei Siamashka; +Cc: linux-bluetooth

Hi Siarhei,

> The attached patch contains optimization for scale factors calculation which
> provides additional SBC encoder speedup.
> 
> For non-gcc compilers, CLZ function is implemented with a very simple and
> slow straightforward code (but it is still faster than current git code even
> if used instead of __builtin_clz). Something better could be done like: 
> http://groups.google.com/group/comp.sys.arm/msg/5ae56e3a95a2345e?hl=en
> But I'm not sure about license/copyright of the code at this link and decided
> not to touch it. Anyway, I don't think that gcc implementation of
> __builtin_clz for the CPU cores which do not support CLZ instruction is any
> worse.

personally I don't really care about non-gcc compilers. I think that
some of the BlueZ source might not even compile without gcc.

Anyway, patch has been applied. Thanks.

Regards

Marcel



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29  1:20 ` Siarhei Siamashka
@ 2009-01-29  7:28   ` Marcel Holtmann
  2009-01-29  9:07     ` Christian Hoene
  2009-01-29  9:51   ` Johan Hedberg
  1 sibling, 1 reply; 17+ messages in thread
From: Marcel Holtmann @ 2009-01-29  7:28 UTC (permalink / raw)
  To: Siarhei Siamashka; +Cc: linux-bluetooth

Hi Siarhei,

> > The attached patch contains optimization for scale factors calculation
> > which provides additional SBC encoder speedup.
> 
> And MMX variant of this optimization can be implemented with something
> like this patch. It still needs to be tested on X86-64 systems though.

I am holding off on this until I get confirmation that is does the right
thing. So tester, please test this patch.

Regards

Marcel



^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29  7:28   ` Marcel Holtmann
@ 2009-01-29  9:07     ` Christian Hoene
  0 siblings, 0 replies; 17+ messages in thread
From: Christian Hoene @ 2009-01-29  9:07 UTC (permalink / raw)
  To: 'Marcel Holtmann', 'Siarhei Siamashka'; +Cc: linux-bluetooth

> Hi Siarhei,
> 
> > > The attached patch contains optimization for scale factors calculation
> > > which provides additional SBC encoder speedup.
> >
> > And MMX variant of this optimization can be implemented with something
> > like this patch. It still needs to be tested on X86-64 systems though.
> 
> I am holding off on this until I get confirmation that is does the right
> thing. So tester, please test this patch.

Yes, Sir!
I'll do it.

Regards,
 Christian


> 
> Regards
> 
> Marcel
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bluetooth"
in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29  1:20 ` Siarhei Siamashka
  2009-01-29  7:28   ` Marcel Holtmann
@ 2009-01-29  9:51   ` Johan Hedberg
  1 sibling, 0 replies; 17+ messages in thread
From: Johan Hedberg @ 2009-01-29  9:51 UTC (permalink / raw)
  To: linux-bluetooth

Hi Siarhei,

On Thu, Jan 29, 2009, Siarhei Siamashka wrote:
> On Thursday 29 January 2009 03:10:03 ext Siarhei Siamashka wrote:
> > The attached patch contains optimization for scale factors calculation
> > which provides additional SBC encoder speedup.
> 
> And MMX variant of this optimization can be implemented with something
> like this patch. It still needs to be tested on X86-64 systems though.

Seems to work fine on my Lenovo X301 (Core 2 Duo) with ubuntu intrepid
amd64.

Johan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29  7:27 ` Marcel Holtmann
@ 2009-01-29 15:00   ` Christian Hoene
  2009-01-29 15:30     ` Siarhei Siamashka
  2009-02-02 11:11   ` Siarhei Siamashka
  1 sibling, 1 reply; 17+ messages in thread
From: Christian Hoene @ 2009-01-29 15:00 UTC (permalink / raw)
  To: 'Marcel Holtmann', 'Siarhei Siamashka'; +Cc: linux-bluetooth

> 
> > The attached patch contains optimization for scale factors calculation
which
> > provides additional SBC encoder speedup.
> >
> > For non-gcc compilers, CLZ function is implemented with a very simple
and
> > slow straightforward code (but it is still faster than current git code
even
> > if used instead of __builtin_clz). Something better could be done like:
> > http://groups.google.com/group/comp.sys.arm/msg/5ae56e3a95a2345e?hl=en
> > But I'm not sure about license/copyright of the code at this link and
> decided
> > not to touch it. Anyway, I don't think that gcc implementation of
> > __builtin_clz for the CPU cores which do not support CLZ instruction is
any
> > worse.
> 
> personally I don't really care about non-gcc compilers. I think that
> some of the BlueZ source might not even compile without gcc.
> 
> Anyway, patch has been applied. Thanks.

The testing results are not positive. It is better to revoke the patch.
http://net.cs.uni-tuebingen.de/html/nexgenvoip/
http://net.cs.uni-tuebingen.de/html/nexgenvoip/html/encoder.bluez.03.wav


Greetings

 Christian

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29 15:00   ` Christian Hoene
@ 2009-01-29 15:30     ` Siarhei Siamashka
  2009-01-29 16:31       ` Siarhei Siamashka
  0 siblings, 1 reply; 17+ messages in thread
From: Siarhei Siamashka @ 2009-01-29 15:30 UTC (permalink / raw)
  To: ext Christian Hoene; +Cc: 'Marcel Holtmann', linux-bluetooth

On Thursday 29 January 2009 17:00:11 ext Christian Hoene wrote:
> > > The attached patch contains optimization for scale factors calculation
>
> which
>
> > > provides additional SBC encoder speedup.
> > >
> > > For non-gcc compilers, CLZ function is implemented with a very simple
>
> and
>
> > > slow straightforward code (but it is still faster than current git code
>
> even
>
> > > if used instead of __builtin_clz). Something better could be done like:
> > > http://groups.google.com/group/comp.sys.arm/msg/5ae56e3a95a2345e?hl=en
> > > But I'm not sure about license/copyright of the code at this link and
> >
> > decided
> >
> > > not to touch it. Anyway, I don't think that gcc implementation of
> > > __builtin_clz for the CPU cores which do not support CLZ instruction is
>
> any
>
> > > worse.
> >
> > personally I don't really care about non-gcc compilers. I think that
> > some of the BlueZ source might not even compile without gcc.
> >
> > Anyway, patch has been applied. Thanks.
>
> The testing results are not positive. It is better to revoke the patch.
> http://net.cs.uni-tuebingen.de/html/nexgenvoip/
> http://net.cs.uni-tuebingen.de/html/nexgenvoip/html/encoder.bluez.03.wav

Thanks for finding the bug. A common things about these failed testcases
is that block size is not 16.

Looks like this was not covered by my own regression tests, I'll try to do 
something about this problem now.


Best regards,
Siarhei Siamashka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29 15:30     ` Siarhei Siamashka
@ 2009-01-29 16:31       ` Siarhei Siamashka
  2009-01-29 17:01         ` Marcel Holtmann
  0 siblings, 1 reply; 17+ messages in thread
From: Siarhei Siamashka @ 2009-01-29 16:31 UTC (permalink / raw)
  To: ext Christian Hoene; +Cc: 'Marcel Holtmann', linux-bluetooth

[-- Attachment #1: Type: text/plain, Size: 1715 bytes --]

On Thursday 29 January 2009 17:30:31 ext Siarhei Siamashka wrote:
> On Thursday 29 January 2009 17:00:11 ext Christian Hoene wrote:
> > > > The attached patch contains optimization for scale factors
> > > > calculation
> >
> > which
> >
> > > > provides additional SBC encoder speedup.
> > > >
> > > > For non-gcc compilers, CLZ function is implemented with a very simple
> >
> > and
> >
> > > > slow straightforward code (but it is still faster than current git
> > > > code
> >
> > even
> >
> > > > if used instead of __builtin_clz). Something better could be done
> > > > like:
> > > > http://groups.google.com/group/comp.sys.arm/msg/5ae56e3a95a2345e?hl=e
> > > >n But I'm not sure about license/copyright of the code at this link
> > > > and
> > >
> > > decided
> > >
> > > > not to touch it. Anyway, I don't think that gcc implementation of
> > > > __builtin_clz for the CPU cores which do not support CLZ instruction
> > > > is
> >
> > any
> >
> > > > worse.
> > >
> > > personally I don't really care about non-gcc compilers. I think that
> > > some of the BlueZ source might not even compile without gcc.
> > >
> > > Anyway, patch has been applied. Thanks.
> >
> > The testing results are not positive. It is better to revoke the patch.
> > http://net.cs.uni-tuebingen.de/html/nexgenvoip/
> > http://net.cs.uni-tuebingen.de/html/nexgenvoip/html/encoder.bluez.03.wav
>
> Thanks for finding the bug. A common things about these failed testcases
> is that block size is not 16.
>
> Looks like this was not covered by my own regression tests, I'll try to do
> something about this problem now.

A fix is attached. I also extended my regression test script to cover such
cases.


Best regards,
Siarhei Siamashka

[-- Attachment #2: 0001-Fix-for-SBC-encoding-with-block-sizes-other-than-16.patch --]
[-- Type: text/x-diff, Size: 1757 bytes --]

>From d9a207576eb9b1df3c8e820f8c34d03f7276bea9 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Thu, 29 Jan 2009 18:15:31 +0200
Subject: [PATCH] Fix for SBC encoding with block sizes other than 16

Thanks to Christian Hoene for finding and reporting the
problem. This regression was intruduced in commit
19af3c49e61aa046375497108e05a3a0605da158
---
 sbc/sbc.c |   19 +++++++++++++------
 1 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/sbc/sbc.c b/sbc/sbc.c
index 8a2d782..29f1d14 100644
--- a/sbc/sbc.c
+++ b/sbc/sbc.c
@@ -651,30 +651,37 @@ static int sbc_analyze_audio(struct sbc_encoder_state *state,
 				struct sbc_frame *frame)
 {
 	int ch, blk;
+	int16_t *x;
 
 	switch (frame->subbands) {
 	case 4:
-		for (ch = 0; ch < frame->channels; ch++)
+		for (ch = 0; ch < frame->channels; ch++) {
+			x = &state->X[ch][state->position - 16 +
+							frame->blocks * 4];
 			for (blk = 0; blk < frame->blocks; blk += 4) {
 				state->sbc_analyze_4b_4s(
-					&state->X[ch][state->position +
-							48 - blk * 4],
+					x,
 					frame->sb_sample_f[blk][ch],
 					frame->sb_sample_f[blk + 1][ch] -
 					frame->sb_sample_f[blk][ch]);
+				x -= 16;
 			}
+		}
 		return frame->blocks * 4;
 
 	case 8:
-		for (ch = 0; ch < frame->channels; ch++)
+		for (ch = 0; ch < frame->channels; ch++) {
+			x = &state->X[ch][state->position - 32 +
+							frame->blocks * 8];
 			for (blk = 0; blk < frame->blocks; blk += 4) {
 				state->sbc_analyze_4b_8s(
-					&state->X[ch][state->position +
-							96 - blk * 8],
+					x,
 					frame->sb_sample_f[blk][ch],
 					frame->sb_sample_f[blk + 1][ch] -
 					frame->sb_sample_f[blk][ch]);
+				x -= 32;
 			}
+		}
 		return frame->blocks * 8;
 
 	default:
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29 16:31       ` Siarhei Siamashka
@ 2009-01-29 17:01         ` Marcel Holtmann
  2009-01-30 11:14           ` Christian Hoene
  0 siblings, 1 reply; 17+ messages in thread
From: Marcel Holtmann @ 2009-01-29 17:01 UTC (permalink / raw)
  To: Siarhei Siamashka; +Cc: ext Christian Hoene, linux-bluetooth

Hi Siarhei,

> > > The testing results are not positive. It is better to revoke the patch.
> > > http://net.cs.uni-tuebingen.de/html/nexgenvoip/
> > > http://net.cs.uni-tuebingen.de/html/nexgenvoip/html/encoder.bluez.03.wav
> >
> > Thanks for finding the bug. A common things about these failed testcases
> > is that block size is not 16.
> >
> > Looks like this was not covered by my own regression tests, I'll try to do
> > something about this problem now.
> 
> A fix is attached. I also extended my regression test script to cover such
> cases.

patch has been applied. Thanks.

Regards

Marcel



^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29 17:01         ` Marcel Holtmann
@ 2009-01-30 11:14           ` Christian Hoene
  2009-01-30 17:05             ` Siarhei Siamashka
  0 siblings, 1 reply; 17+ messages in thread
From: Christian Hoene @ 2009-01-30 11:14 UTC (permalink / raw)
  To: 'Marcel Holtmann', 'Siarhei Siamashka'; +Cc: linux-bluetooth

> > > Thanks for finding the bug. A common things about these failed
testcases
> > > is that block size is not 16.
> > >
> > > Looks like this was not covered by my own regression tests, I'll try
to do
> > > something about this problem now.
> >
> > A fix is attached. I also extended my regression test script to cover
such
> > cases.
> 
> patch has been applied. Thanks.
> 

The tests have passed all.
http://net.cs.uni-tuebingen.de/html/nexgenvoip/html/

Regards,

 Christian

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-30 11:14           ` Christian Hoene
@ 2009-01-30 17:05             ` Siarhei Siamashka
  2009-02-01 16:53               ` Marcel Holtmann
  0 siblings, 1 reply; 17+ messages in thread
From: Siarhei Siamashka @ 2009-01-30 17:05 UTC (permalink / raw)
  To: ext Christian Hoene; +Cc: 'Marcel Holtmann', linux-bluetooth

On Friday 30 January 2009 13:14:47 ext Christian Hoene wrote:
> > > > Thanks for finding the bug. A common things about these failed
>
> testcases
>
> > > > is that block size is not 16.
> > > >
> > > > Looks like this was not covered by my own regression tests, I'll try
>
> to do
>
> > > > something about this problem now.
> > >
> > > A fix is attached. I also extended my regression test script to cover
>
> such
>
> > > cases.
> >
> > patch has been applied. Thanks.
>
> The tests have passed all.
> http://net.cs.uni-tuebingen.de/html/nexgenvoip/html/
>
> Regards,
>
>  Christian

Thanks a lot for keeping an eye on bluez sbc implementation quality.

-- 
Best regards,
Siarhei Siamashka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-30 17:05             ` Siarhei Siamashka
@ 2009-02-01 16:53               ` Marcel Holtmann
  2009-02-02 10:48                 ` Siarhei Siamashka
  0 siblings, 1 reply; 17+ messages in thread
From: Marcel Holtmann @ 2009-02-01 16:53 UTC (permalink / raw)
  To: Siarhei Siamashka; +Cc: ext Christian Hoene, linux-bluetooth

Hi Siarhei,

> > > > cases.
> > >
> > > patch has been applied. Thanks.
> >
> > The tests have passed all.
> > http://net.cs.uni-tuebingen.de/html/nexgenvoip/html/
> >
>
> Thanks a lot for keeping an eye on bluez sbc implementation quality.

are all patches applied now or am I missing one?

Regards

Marcel



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-02-01 16:53               ` Marcel Holtmann
@ 2009-02-02 10:48                 ` Siarhei Siamashka
  2009-02-02 15:20                   ` Marcel Holtmann
  0 siblings, 1 reply; 17+ messages in thread
From: Siarhei Siamashka @ 2009-02-02 10:48 UTC (permalink / raw)
  To: ext Marcel Holtmann; +Cc: linux-bluetooth

On Sunday 01 February 2009 18:53:43 ext Marcel Holtmann wrote:
> Hi Siarhei,
>
> > > > > cases.
> > > >
> > > > patch has been applied. Thanks.
> > >
> > > The tests have passed all.
> > > http://net.cs.uni-tuebingen.de/html/nexgenvoip/html/
> >
> > Thanks a lot for keeping an eye on bluez sbc implementation quality.
>
> are all patches applied now or am I missing one?

Yes, thanks, all the patches have been applied except for this one:
http://marc.info/?l=linux-bluetooth&m=123319235104928&w=2

It should work fine, but I'm still considering how to better optimize
processing of joint stereo, which might also require changing this code
a bit. For now it's more like a demonstration that scale factors processing 
can be SIMD optimized quite fine.

I will provide a more complete patch with optimizations for both scale factors
and joint stereo a bit later.

-- 
Best regards,
Siarhei Siamashka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-01-29  7:27 ` Marcel Holtmann
  2009-01-29 15:00   ` Christian Hoene
@ 2009-02-02 11:11   ` Siarhei Siamashka
  1 sibling, 0 replies; 17+ messages in thread
From: Siarhei Siamashka @ 2009-02-02 11:11 UTC (permalink / raw)
  To: ext Marcel Holtmann; +Cc: linux-bluetooth

On Thursday 29 January 2009 09:27:38 ext Marcel Holtmann wrote:
> Hi Siarhei,
>
> > The attached patch contains optimization for scale factors calculation
> > which provides additional SBC encoder speedup.
> >
> > For non-gcc compilers, CLZ function is implemented with a very simple and
> > slow straightforward code (but it is still faster than current git code
> > even if used instead of __builtin_clz). Something better could be done
> > like:
> > http://groups.google.com/group/comp.sys.arm/msg/5ae56e3a95a2345e?hl=en
> > But I'm not sure about license/copyright of the code at this link and
> > decided not to touch it. Anyway, I don't think that gcc implementation of
> > __builtin_clz for the CPU cores which do not support CLZ instruction is
> > any worse.
>
> personally I don't really care about non-gcc compilers. I think that
> some of the BlueZ source might not even compile without gcc.

At least for the SBC codec, there is no harm in trying to keep it as portable
as it is possible. Somebody might want to compile it with some other, better
optimizing compiler and link with the rest of gcc compiled bluez. Or SBC
processing can be offloaded to DSP (C55x port exists) with the rest of bluez
stack running on main core.

> Anyway, patch has been applied. Thanks.

Thanks for applying it.

-- 
Best regards,
Siarhei Siamashka

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-02-02 10:48                 ` Siarhei Siamashka
@ 2009-02-02 15:20                   ` Marcel Holtmann
  2009-03-16 19:32                     ` Siarhei Siamashka
  0 siblings, 1 reply; 17+ messages in thread
From: Marcel Holtmann @ 2009-02-02 15:20 UTC (permalink / raw)
  To: Siarhei Siamashka; +Cc: linux-bluetooth

Hi Siarhei,

> > > > > > cases.
> > > > >
> > > > > patch has been applied. Thanks.
> > > >
> > > > The tests have passed all.
> > > > http://net.cs.uni-tuebingen.de/html/nexgenvoip/html/
> > >
> > > Thanks a lot for keeping an eye on bluez sbc implementation quality.
> >
> > are all patches applied now or am I missing one?
> 
> Yes, thanks, all the patches have been applied except for this one:
> http://marc.info/?l=linux-bluetooth&m=123319235104928&w=2
> 
> It should work fine, but I'm still considering how to better optimize
> processing of joint stereo, which might also require changing this code
> a bit. For now it's more like a demonstration that scale factors processing 
> can be SIMD optimized quite fine.
> 
> I will provide a more complete patch with optimizations for both scale factors
> and joint stereo a bit later.

please re-sent this one with a proper commit message and I apply it.
Then you can start from there is you like. Or send me a complete new
one.

Regards

Marcel



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz
  2009-02-02 15:20                   ` Marcel Holtmann
@ 2009-03-16 19:32                     ` Siarhei Siamashka
  0 siblings, 0 replies; 17+ messages in thread
From: Siarhei Siamashka @ 2009-03-16 19:32 UTC (permalink / raw)
  To: ext Marcel Holtmann; +Cc: linux-bluetooth

[-- Attachment #1: Type: text/plain, Size: 1336 bytes --]

On Monday 02 February 2009 17:20:56 ext Marcel Holtmann wrote:
> Hi Siarhei,
>
> > > > > > > cases.
> > > > > >
> > > > > > patch has been applied. Thanks.
> > > > >
> > > > > The tests have passed all.
> > > > > http://net.cs.uni-tuebingen.de/html/nexgenvoip/html/
> > > >
> > > > Thanks a lot for keeping an eye on bluez sbc implementation quality.
> > >
> > > are all patches applied now or am I missing one?
> >
> > Yes, thanks, all the patches have been applied except for this one:
> > http://marc.info/?l=linux-bluetooth&m=123319235104928&w=2
> >
> > It should work fine, but I'm still considering how to better optimize
> > processing of joint stereo, which might also require changing this code
> > a bit. For now it's more like a demonstration that scale factors
> > processing can be SIMD optimized quite fine.
> >
> > I will provide a more complete patch with optimizations for both scale
> > factors and joint stereo a bit later.
>
> please re-sent this one with a proper commit message and I apply it.
> Then you can start from there is you like. Or send me a complete new
> one.

Done. Commit message added and patch attached.

time ./sbcenc tesfile.au > /dev/null

before:
real    0m6.404s
user    0m6.152s
sys     0m0.244s

after:
real    0m5.630s
user    0m5.376s
sys     0m0.248s

-- 
Best regards,
Siarhei Siamashka

[-- Attachment #2: 0002-sbc-MMX-optimization-for-scale-factors-calculation.patch --]
[-- Type: text/x-diff, Size: 2519 bytes --]

From 83bece7b4642dde20b76253b7d18228d6654fa70 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 16 Mar 2009 21:10:32 +0200
Subject: [PATCH] sbc: MMX optimization for scale factors calculation

---
 sbc/sbc_primitives_mmx.c |   54 ++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 54 insertions(+), 0 deletions(-)

diff --git a/sbc/sbc_primitives_mmx.c b/sbc/sbc_primitives_mmx.c
index 08e9ca2..d8373b3 100644
--- a/sbc/sbc_primitives_mmx.c
+++ b/sbc/sbc_primitives_mmx.c
@@ -275,6 +275,59 @@ static inline void sbc_analyze_4b_8s_mmx(int16_t *x, int32_t *out,
 	asm volatile ("emms\n");
 }
 
+static void sbc_calc_scalefactors_mmx(
+	int32_t sb_sample_f[16][2][8],
+	uint32_t scale_factor[2][8],
+	int blocks, int channels, int subbands)
+{
+	static const SBC_ALIGNED int32_t consts[2] = {
+		1 << SCALE_OUT_BITS,
+		1 << SCALE_OUT_BITS,
+	};
+	int ch, sb;
+	intptr_t blk;
+	for (ch = 0; ch < channels; ch++) {
+		for (sb = 0; sb < subbands; sb += 2) {
+			blk = (blocks - 1) * (((char *) &sb_sample_f[1][0][0] -
+				(char *) &sb_sample_f[0][0][0]));
+			asm volatile (
+				"movq         (%4), %%mm0\n"
+			"1:\n"
+				"movq     (%1, %0), %%mm1\n"
+				"pxor        %%mm2, %%mm2\n"
+				"pcmpgtd     %%mm2, %%mm1\n"
+				"paddd    (%1, %0), %%mm1\n"
+				"pcmpgtd     %%mm1, %%mm2\n"
+				"pxor        %%mm2, %%mm1\n"
+
+				"por         %%mm1, %%mm0\n"
+
+				"sub            %2, %0\n"
+				"jns            1b\n"
+
+				"movd        %%mm0, %k0\n"
+				"psrlq         $32, %%mm0\n"
+				"bsrl          %k0, %k0\n"
+				"subl           %5, %k0\n"
+				"movl          %k0, (%3)\n"
+
+				"movd        %%mm0, %k0\n"
+				"bsrl          %k0, %k0\n"
+				"subl           %5, %k0\n"
+				"movl          %k0, 4(%3)\n"
+			: "+r" (blk)
+			: "r" (&sb_sample_f[0][ch][sb]),
+				"i" ((char *) &sb_sample_f[1][0][0] -
+					(char *) &sb_sample_f[0][0][0]),
+				"r" (&scale_factor[ch][sb]),
+				"r" (&consts),
+				"i" (SCALE_OUT_BITS)
+			: "memory");
+		}
+	}
+	asm volatile ("emms\n");
+}
+
 static int check_mmx_support(void)
 {
 #ifdef __amd64__
@@ -313,6 +366,7 @@ void sbc_init_primitives_mmx(struct sbc_encoder_state *state)
 	if (check_mmx_support()) {
 		state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_mmx;
 		state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_mmx;
+		state->sbc_calc_scalefactors = sbc_calc_scalefactors_mmx;
 		state->implementation_info = "MMX";
 	}
 }
-- 
1.5.6.5


^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2009-03-16 19:32 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-01-29  1:10 [PATCH] SBC encoder scale factors calculation optimized with __builtin_clz Siarhei Siamashka
2009-01-29  1:20 ` Siarhei Siamashka
2009-01-29  7:28   ` Marcel Holtmann
2009-01-29  9:07     ` Christian Hoene
2009-01-29  9:51   ` Johan Hedberg
2009-01-29  7:27 ` Marcel Holtmann
2009-01-29 15:00   ` Christian Hoene
2009-01-29 15:30     ` Siarhei Siamashka
2009-01-29 16:31       ` Siarhei Siamashka
2009-01-29 17:01         ` Marcel Holtmann
2009-01-30 11:14           ` Christian Hoene
2009-01-30 17:05             ` Siarhei Siamashka
2009-02-01 16:53               ` Marcel Holtmann
2009-02-02 10:48                 ` Siarhei Siamashka
2009-02-02 15:20                   ` Marcel Holtmann
2009-03-16 19:32                     ` Siarhei Siamashka
2009-02-02 11:11   ` Siarhei Siamashka

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.