All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH SBC 1/3] sbc: Add initial code for SSE primitives
@ 2020-08-11 18:16 Luiz Augusto von Dentz
  2020-08-11 18:16 ` [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s Luiz Augusto von Dentz
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Luiz Augusto von Dentz @ 2020-08-11 18:16 UTC (permalink / raw)
  To: linux-bluetooth

From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>

---
 Makefile.am              |   1 +
 sbc/sbc_primitives.c     |  20 ++-
 sbc/sbc_primitives_sse.c | 361 +++++++++++++++++++++++++++++++++++++++
 sbc/sbc_primitives_sse.h |  38 +++++
 4 files changed, 417 insertions(+), 3 deletions(-)
 create mode 100644 sbc/sbc_primitives_sse.c
 create mode 100644 sbc/sbc_primitives_sse.h

diff --git a/Makefile.am b/Makefile.am
index 342043d..7ff0c7d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,6 +14,7 @@ sbc_headers = sbc/sbc.h
 
 sbc_sources = sbc/sbc.c sbc/sbc_private.h sbc/sbc_math.h sbc/sbc_tables.h \
 		sbc/sbc_primitives.h sbc/sbc_primitives.c \
+		sbc/sbc_primitives_sse.h sbc/sbc_primitives_sse.c \
 		sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
 		sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
 		sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index ff343cf..97a75be 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@
 #include "sbc_tables.h"
 
 #include "sbc_primitives.h"
+#include "sbc_primitives_sse.h"
 #include "sbc_primitives_mmx.h"
 #include "sbc_primitives_iwmmxt.h"
 #include "sbc_primitives_neon.h"
@@ -590,6 +591,21 @@ static int sbc_calc_scalefactors_j(
 	return joint;
 }
 
+static void sbc_init_primitives_x86(struct sbc_encoder_state *state)
+{
+	__builtin_cpu_init();
+
+#ifdef SBC_BUILD_WITH_MMX_SUPPORT
+	if (__builtin_cpu_supports("mmx"))
+		sbc_init_primitives_mmx(state);
+#endif
+
+#ifdef SBC_BUILD_WITH_SSE_SUPPORT
+	if (__builtin_cpu_supports("sse4.2"))
+		sbc_init_primitives_sse(state);
+#endif
+}
+
 /*
  * Detect CPU features and setup function pointers
  */
@@ -614,9 +630,7 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
 	state->implementation_info = "Generic C";
 
 	/* X86/AMD64 optimizations */
-#ifdef SBC_BUILD_WITH_MMX_SUPPORT
-	sbc_init_primitives_mmx(state);
-#endif
+	sbc_init_primitives_x86(state);
 
 	/* ARM optimizations */
 #ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
diff --git a/sbc/sbc_primitives_sse.c b/sbc/sbc_primitives_sse.c
new file mode 100644
index 0000000..c2b729a
--- /dev/null
+++ b/sbc/sbc_primitives_sse.c
@@ -0,0 +1,361 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Copyright (C) 2020 Intel Corporation
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_sse.h"
+
+/*
+ * SSE optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_SSE_SUPPORT
+
+static inline void sbc_analyze_four_sse(const int16_t *in, int32_t *out,
+					const FIXED_T *consts)
+{
+	static const SBC_ALIGNED int32_t round_c[2] = {
+		1 << (SBC_PROTO_FIXED4_SCALE - 1),
+		1 << (SBC_PROTO_FIXED4_SCALE - 1),
+	};
+	__asm__ volatile (
+		"movq        (%0), %%mm0\n"
+		"movq       8(%0), %%mm1\n"
+		"pmaddwd     (%1), %%mm0\n"
+		"pmaddwd    8(%1), %%mm1\n"
+		"paddd       (%2), %%mm0\n"
+		"paddd       (%2), %%mm1\n"
+		"\n"
+		"movq      16(%0), %%mm2\n"
+		"movq      24(%0), %%mm3\n"
+		"pmaddwd   16(%1), %%mm2\n"
+		"pmaddwd   24(%1), %%mm3\n"
+		"paddd      %%mm2, %%mm0\n"
+		"paddd      %%mm3, %%mm1\n"
+		"\n"
+		"movq      32(%0), %%mm2\n"
+		"movq      40(%0), %%mm3\n"
+		"pmaddwd   32(%1), %%mm2\n"
+		"pmaddwd   40(%1), %%mm3\n"
+		"paddd      %%mm2, %%mm0\n"
+		"paddd      %%mm3, %%mm1\n"
+		"\n"
+		"movq      48(%0), %%mm2\n"
+		"movq      56(%0), %%mm3\n"
+		"pmaddwd   48(%1), %%mm2\n"
+		"pmaddwd   56(%1), %%mm3\n"
+		"paddd      %%mm2, %%mm0\n"
+		"paddd      %%mm3, %%mm1\n"
+		"\n"
+		"movq      64(%0), %%mm2\n"
+		"movq      72(%0), %%mm3\n"
+		"pmaddwd   64(%1), %%mm2\n"
+		"pmaddwd   72(%1), %%mm3\n"
+		"paddd      %%mm2, %%mm0\n"
+		"paddd      %%mm3, %%mm1\n"
+		"\n"
+		"psrad         %4, %%mm0\n"
+		"psrad         %4, %%mm1\n"
+		"packssdw   %%mm0, %%mm0\n"
+		"packssdw   %%mm1, %%mm1\n"
+		"\n"
+		"movq       %%mm0, %%mm2\n"
+		"pmaddwd   80(%1), %%mm0\n"
+		"pmaddwd   88(%1), %%mm2\n"
+		"\n"
+		"movq       %%mm1, %%mm3\n"
+		"pmaddwd   96(%1), %%mm1\n"
+		"pmaddwd  104(%1), %%mm3\n"
+		"paddd      %%mm1, %%mm0\n"
+		"paddd      %%mm3, %%mm2\n"
+		"\n"
+		"movq       %%mm0, (%3)\n"
+		"movq       %%mm2, 8(%3)\n"
+		:
+		: "r" (in), "r" (consts), "r" (&round_c), "r" (out),
+			"i" (SBC_PROTO_FIXED4_SCALE)
+		: "cc", "memory");
+}
+
+static inline void sbc_analyze_eight_sse(const int16_t *in, int32_t *out,
+							const FIXED_T *consts)
+{
+	static const SBC_ALIGNED int32_t round_c[2] = {
+		1 << (SBC_PROTO_FIXED8_SCALE - 1),
+		1 << (SBC_PROTO_FIXED8_SCALE - 1),
+	};
+	__asm__ volatile (
+		"movq        (%0), %%mm0\n"
+		"movq       8(%0), %%mm1\n"
+		"movq      16(%0), %%mm2\n"
+		"movq      24(%0), %%mm3\n"
+		"pmaddwd     (%1), %%mm0\n"
+		"pmaddwd    8(%1), %%mm1\n"
+		"pmaddwd   16(%1), %%mm2\n"
+		"pmaddwd   24(%1), %%mm3\n"
+		"paddd       (%2), %%mm0\n"
+		"paddd       (%2), %%mm1\n"
+		"paddd       (%2), %%mm2\n"
+		"paddd       (%2), %%mm3\n"
+		"\n"
+		"movq      32(%0), %%mm4\n"
+		"movq      40(%0), %%mm5\n"
+		"movq      48(%0), %%mm6\n"
+		"movq      56(%0), %%mm7\n"
+		"pmaddwd   32(%1), %%mm4\n"
+		"pmaddwd   40(%1), %%mm5\n"
+		"pmaddwd   48(%1), %%mm6\n"
+		"pmaddwd   56(%1), %%mm7\n"
+		"paddd      %%mm4, %%mm0\n"
+		"paddd      %%mm5, %%mm1\n"
+		"paddd      %%mm6, %%mm2\n"
+		"paddd      %%mm7, %%mm3\n"
+		"\n"
+		"movq      64(%0), %%mm4\n"
+		"movq      72(%0), %%mm5\n"
+		"movq      80(%0), %%mm6\n"
+		"movq      88(%0), %%mm7\n"
+		"pmaddwd   64(%1), %%mm4\n"
+		"pmaddwd   72(%1), %%mm5\n"
+		"pmaddwd   80(%1), %%mm6\n"
+		"pmaddwd   88(%1), %%mm7\n"
+		"paddd      %%mm4, %%mm0\n"
+		"paddd      %%mm5, %%mm1\n"
+		"paddd      %%mm6, %%mm2\n"
+		"paddd      %%mm7, %%mm3\n"
+		"\n"
+		"movq      96(%0), %%mm4\n"
+		"movq     104(%0), %%mm5\n"
+		"movq     112(%0), %%mm6\n"
+		"movq     120(%0), %%mm7\n"
+		"pmaddwd   96(%1), %%mm4\n"
+		"pmaddwd  104(%1), %%mm5\n"
+		"pmaddwd  112(%1), %%mm6\n"
+		"pmaddwd  120(%1), %%mm7\n"
+		"paddd      %%mm4, %%mm0\n"
+		"paddd      %%mm5, %%mm1\n"
+		"paddd      %%mm6, %%mm2\n"
+		"paddd      %%mm7, %%mm3\n"
+		"\n"
+		"movq     128(%0), %%mm4\n"
+		"movq     136(%0), %%mm5\n"
+		"movq     144(%0), %%mm6\n"
+		"movq     152(%0), %%mm7\n"
+		"pmaddwd  128(%1), %%mm4\n"
+		"pmaddwd  136(%1), %%mm5\n"
+		"pmaddwd  144(%1), %%mm6\n"
+		"pmaddwd  152(%1), %%mm7\n"
+		"paddd      %%mm4, %%mm0\n"
+		"paddd      %%mm5, %%mm1\n"
+		"paddd      %%mm6, %%mm2\n"
+		"paddd      %%mm7, %%mm3\n"
+		"\n"
+		"psrad         %4, %%mm0\n"
+		"psrad         %4, %%mm1\n"
+		"psrad         %4, %%mm2\n"
+		"psrad         %4, %%mm3\n"
+		"\n"
+		"packssdw   %%mm0, %%mm0\n"
+		"packssdw   %%mm1, %%mm1\n"
+		"packssdw   %%mm2, %%mm2\n"
+		"packssdw   %%mm3, %%mm3\n"
+		"\n"
+		"movq       %%mm0, %%mm4\n"
+		"movq       %%mm0, %%mm5\n"
+		"pmaddwd  160(%1), %%mm4\n"
+		"pmaddwd  168(%1), %%mm5\n"
+		"\n"
+		"movq       %%mm1, %%mm6\n"
+		"movq       %%mm1, %%mm7\n"
+		"pmaddwd  192(%1), %%mm6\n"
+		"pmaddwd  200(%1), %%mm7\n"
+		"paddd      %%mm6, %%mm4\n"
+		"paddd      %%mm7, %%mm5\n"
+		"\n"
+		"movq       %%mm2, %%mm6\n"
+		"movq       %%mm2, %%mm7\n"
+		"pmaddwd  224(%1), %%mm6\n"
+		"pmaddwd  232(%1), %%mm7\n"
+		"paddd      %%mm6, %%mm4\n"
+		"paddd      %%mm7, %%mm5\n"
+		"\n"
+		"movq       %%mm3, %%mm6\n"
+		"movq       %%mm3, %%mm7\n"
+		"pmaddwd  256(%1), %%mm6\n"
+		"pmaddwd  264(%1), %%mm7\n"
+		"paddd      %%mm6, %%mm4\n"
+		"paddd      %%mm7, %%mm5\n"
+		"\n"
+		"movq       %%mm4, (%3)\n"
+		"movq       %%mm5, 8(%3)\n"
+		"\n"
+		"movq       %%mm0, %%mm5\n"
+		"pmaddwd  176(%1), %%mm0\n"
+		"pmaddwd  184(%1), %%mm5\n"
+		"\n"
+		"movq       %%mm1, %%mm7\n"
+		"pmaddwd  208(%1), %%mm1\n"
+		"pmaddwd  216(%1), %%mm7\n"
+		"paddd      %%mm1, %%mm0\n"
+		"paddd      %%mm7, %%mm5\n"
+		"\n"
+		"movq       %%mm2, %%mm7\n"
+		"pmaddwd  240(%1), %%mm2\n"
+		"pmaddwd  248(%1), %%mm7\n"
+		"paddd      %%mm2, %%mm0\n"
+		"paddd      %%mm7, %%mm5\n"
+		"\n"
+		"movq       %%mm3, %%mm7\n"
+		"pmaddwd  272(%1), %%mm3\n"
+		"pmaddwd  280(%1), %%mm7\n"
+		"paddd      %%mm3, %%mm0\n"
+		"paddd      %%mm7, %%mm5\n"
+		"\n"
+		"movq       %%mm0, 16(%3)\n"
+		"movq       %%mm5, 24(%3)\n"
+		:
+		: "r" (in), "r" (consts), "r" (&round_c), "r" (out),
+			"i" (SBC_PROTO_FIXED8_SCALE)
+		: "cc", "memory");
+}
+
+static inline void sbc_analyze_4b_4s_sse(struct sbc_encoder_state *state,
+		int16_t *x, int32_t *out, int out_stride)
+{
+	/* Analyze blocks */
+	sbc_analyze_four_sse(x + 12, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_sse(x + 8, out, analysis_consts_fixed4_simd_even);
+	out += out_stride;
+	sbc_analyze_four_sse(x + 4, out, analysis_consts_fixed4_simd_odd);
+	out += out_stride;
+	sbc_analyze_four_sse(x + 0, out, analysis_consts_fixed4_simd_even);
+
+	__asm__ volatile ("emms\n");
+}
+
+static inline void sbc_analyze_4b_8s_sse(struct sbc_encoder_state *state,
+		int16_t *x, int32_t *out, int out_stride)
+{
+	/* Analyze blocks */
+	sbc_analyze_eight_sse(x + 24, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_sse(x + 16, out, analysis_consts_fixed8_simd_even);
+	out += out_stride;
+	sbc_analyze_eight_sse(x + 8, out, analysis_consts_fixed8_simd_odd);
+	out += out_stride;
+	sbc_analyze_eight_sse(x + 0, out, analysis_consts_fixed8_simd_even);
+
+	__asm__ volatile ("emms\n");
+}
+
+static inline void sbc_analyze_1b_8s_sse_even(struct sbc_encoder_state *state,
+		int16_t *x, int32_t *out, int out_stride);
+
+static inline void sbc_analyze_1b_8s_sse_odd(struct sbc_encoder_state *state,
+		int16_t *x, int32_t *out, int out_stride)
+{
+	sbc_analyze_eight_sse(x, out, analysis_consts_fixed8_simd_odd);
+	state->sbc_analyze_8s = sbc_analyze_1b_8s_sse_even;
+
+	__asm__ volatile ("emms\n");
+}
+
+static inline void sbc_analyze_1b_8s_sse_even(struct sbc_encoder_state *state,
+		int16_t *x, int32_t *out, int out_stride)
+{
+	sbc_analyze_eight_sse(x, out, analysis_consts_fixed8_simd_even);
+	state->sbc_analyze_8s = sbc_analyze_1b_8s_sse_odd;
+
+	__asm__ volatile ("emms\n");
+}
+
+static void sbc_calc_scalefactors_sse(
+	int32_t sb_sample_f[16][2][8],
+	uint32_t scale_factor[2][8],
+	int blocks, int channels, int subbands)
+{
+	static const SBC_ALIGNED int32_t consts[2] = {
+		1 << SCALE_OUT_BITS,
+		1 << SCALE_OUT_BITS,
+	};
+	int ch, sb;
+	intptr_t blk;
+	for (ch = 0; ch < channels; ch++) {
+		for (sb = 0; sb < subbands; sb += 2) {
+			blk = (blocks - 1) * (((char *) &sb_sample_f[1][0][0] -
+				(char *) &sb_sample_f[0][0][0]));
+			__asm__ volatile (
+				"movq         (%4), %%mm0\n"
+			"1:\n"
+				"movq     (%1, %0), %%mm1\n"
+				"pxor        %%mm2, %%mm2\n"
+				"pcmpgtd     %%mm2, %%mm1\n"
+				"paddd    (%1, %0), %%mm1\n"
+				"pcmpgtd     %%mm1, %%mm2\n"
+				"pxor        %%mm2, %%mm1\n"
+
+				"por         %%mm1, %%mm0\n"
+
+				"sub            %2, %0\n"
+				"jns            1b\n"
+
+				"movd        %%mm0, %k0\n"
+				"psrlq         $32, %%mm0\n"
+				"bsrl          %k0, %k0\n"
+				"subl           %5, %k0\n"
+				"movl          %k0, (%3)\n"
+
+				"movd        %%mm0, %k0\n"
+				"bsrl          %k0, %k0\n"
+				"subl           %5, %k0\n"
+				"movl          %k0, 4(%3)\n"
+			: "+r" (blk)
+			: "r" (&sb_sample_f[0][ch][sb]),
+				"i" ((char *) &sb_sample_f[1][0][0] -
+					(char *) &sb_sample_f[0][0][0]),
+				"r" (&scale_factor[ch][sb]),
+				"r" (&consts),
+				"i" (SCALE_OUT_BITS)
+			: "cc", "memory");
+		}
+	}
+	__asm__ volatile ("emms\n");
+}
+
+void sbc_init_primitives_sse(struct sbc_encoder_state *state)
+{
+	state->sbc_analyze_4s = sbc_analyze_4b_4s_sse;
+	if (state->increment == 1)
+		state->sbc_analyze_8s = sbc_analyze_1b_8s_sse_odd;
+	else
+		state->sbc_analyze_8s = sbc_analyze_4b_8s_sse;
+	state->sbc_calc_scalefactors = sbc_calc_scalefactors_sse;
+	state->implementation_info = "SSE";
+}
+
+#endif
diff --git a/sbc/sbc_primitives_sse.h b/sbc/sbc_primitives_sse.h
new file mode 100644
index 0000000..8830cfd
--- /dev/null
+++ b/sbc/sbc_primitives_sse.h
@@ -0,0 +1,38 @@
+/*
+ *
+ *  Bluetooth low-complexity, subband codec (SBC) library
+ *
+ *  Copyright (C) 2020  Intel Corporation
+ *
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_SSE_H
+#define __SBC_PRIMITIVES_SSE_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__)) && \
+		!defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_SSE_SUPPORT
+
+void sbc_init_primitives_sse(struct sbc_encoder_state *encoder_state);
+
+#endif
+
+#endif
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s
  2020-08-11 18:16 [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Luiz Augusto von Dentz
@ 2020-08-11 18:16 ` Luiz Augusto von Dentz
  2020-08-11 18:16 ` [PATCH SBC 3/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_8s Luiz Augusto von Dentz
  2020-08-12 12:48 ` [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Marcel Holtmann
  2 siblings, 0 replies; 5+ messages in thread
From: Luiz Augusto von Dentz @ 2020-08-11 18:16 UTC (permalink / raw)
  To: linux-bluetooth

From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>

This makes use 128 bit XMM registers whenever possible.

$ time src/sbcenc_mmx -s 4 sin_4m.au > /dev/null
real    0m1.073s
user    0m1.039s
sys     0m0.030s

===  After ====

$ time src/sbcenc -s 4 sin_4m.au > /dev/null
real    0m1.049s
user    0m1.000s
sys     0m0.047s
---
 sbc/sbc_primitives_sse.c | 58 +++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 33 deletions(-)

diff --git a/sbc/sbc_primitives_sse.c b/sbc/sbc_primitives_sse.c
index c2b729a..6471bd5 100644
--- a/sbc/sbc_primitives_sse.c
+++ b/sbc/sbc_primitives_sse.c
@@ -38,48 +38,40 @@
 static inline void sbc_analyze_four_sse(const int16_t *in, int32_t *out,
 					const FIXED_T *consts)
 {
-	static const SBC_ALIGNED int32_t round_c[2] = {
+	static const SBC_ALIGNED int32_t round_c[4] = {
+		1 << (SBC_PROTO_FIXED4_SCALE - 1),
+		1 << (SBC_PROTO_FIXED4_SCALE - 1),
 		1 << (SBC_PROTO_FIXED4_SCALE - 1),
 		1 << (SBC_PROTO_FIXED4_SCALE - 1),
 	};
 	__asm__ volatile (
-		"movq        (%0), %%mm0\n"
-		"movq       8(%0), %%mm1\n"
-		"pmaddwd     (%1), %%mm0\n"
-		"pmaddwd    8(%1), %%mm1\n"
-		"paddd       (%2), %%mm0\n"
-		"paddd       (%2), %%mm1\n"
+		"movdqu      (%0), %%xmm0\n"
+		"pmaddwd     (%1), %%xmm0\n"
+		"paddd       (%2), %%xmm0\n"
 		"\n"
-		"movq      16(%0), %%mm2\n"
-		"movq      24(%0), %%mm3\n"
-		"pmaddwd   16(%1), %%mm2\n"
-		"pmaddwd   24(%1), %%mm3\n"
-		"paddd      %%mm2, %%mm0\n"
-		"paddd      %%mm3, %%mm1\n"
+		"movdqu    16(%0), %%xmm1\n"
+		"pmaddwd   16(%1), %%xmm1\n"
+		"paddd     %%xmm1, %%xmm0\n"
 		"\n"
-		"movq      32(%0), %%mm2\n"
-		"movq      40(%0), %%mm3\n"
-		"pmaddwd   32(%1), %%mm2\n"
-		"pmaddwd   40(%1), %%mm3\n"
-		"paddd      %%mm2, %%mm0\n"
-		"paddd      %%mm3, %%mm1\n"
+		"movdqu    32(%0), %%xmm1\n"
+		"pmaddwd   32(%1), %%xmm1\n"
+		"paddd     %%xmm1, %%xmm0\n"
 		"\n"
-		"movq      48(%0), %%mm2\n"
-		"movq      56(%0), %%mm3\n"
-		"pmaddwd   48(%1), %%mm2\n"
-		"pmaddwd   56(%1), %%mm3\n"
-		"paddd      %%mm2, %%mm0\n"
-		"paddd      %%mm3, %%mm1\n"
+		"movdqu    48(%0), %%xmm1\n"
+		"pmaddwd   48(%1), %%xmm1\n"
+		"paddd     %%xmm1, %%xmm0\n"
 		"\n"
-		"movq      64(%0), %%mm2\n"
-		"movq      72(%0), %%mm3\n"
-		"pmaddwd   64(%1), %%mm2\n"
-		"pmaddwd   72(%1), %%mm3\n"
-		"paddd      %%mm2, %%mm0\n"
-		"paddd      %%mm3, %%mm1\n"
+		"movdqu    64(%0), %%xmm1\n"
+		"pmaddwd   64(%1), %%xmm1\n"
+		"paddd     %%xmm1, %%xmm0\n"
+		"\n"
+		"psrad         %4, %%xmm0\n"
+		"\n"
+		"movdqa    %%xmm0, %%xmm1\n"
+		"punpckhqdq %%xmm1, %%xmm1\n"
+		"movdq2q   %%xmm0, %%mm0\n"
+		"movdq2q   %%xmm1, %%mm1\n"
 		"\n"
-		"psrad         %4, %%mm0\n"
-		"psrad         %4, %%mm1\n"
 		"packssdw   %%mm0, %%mm0\n"
 		"packssdw   %%mm1, %%mm1\n"
 		"\n"
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH SBC 3/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_8s
  2020-08-11 18:16 [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Luiz Augusto von Dentz
  2020-08-11 18:16 ` [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s Luiz Augusto von Dentz
@ 2020-08-11 18:16 ` Luiz Augusto von Dentz
  2020-08-12 12:48 ` [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Marcel Holtmann
  2 siblings, 0 replies; 5+ messages in thread
From: Luiz Augusto von Dentz @ 2020-08-11 18:16 UTC (permalink / raw)
  To: linux-bluetooth

From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>

This makes use 128 bit XMM registers whenever possible.

$ time src/sbcenc_mmx -s 8 sin_64m.au > /dev/null
real    0m1.064s
user    0m1.012s
sys     0m0.049s

===  After ====

$ time src/sbcenc -s 8 sin_64m.au > /dev/null
real    0m1.032s
user    0m0.996s
sys     0m0.033s
---
 sbc/sbc_primitives_sse.c | 109 ++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 65 deletions(-)

diff --git a/sbc/sbc_primitives_sse.c b/sbc/sbc_primitives_sse.c
index 6471bd5..6f8eb49 100644
--- a/sbc/sbc_primitives_sse.c
+++ b/sbc/sbc_primitives_sse.c
@@ -96,80 +96,59 @@ static inline void sbc_analyze_four_sse(const int16_t *in, int32_t *out,
 static inline void sbc_analyze_eight_sse(const int16_t *in, int32_t *out,
 							const FIXED_T *consts)
 {
-	static const SBC_ALIGNED int32_t round_c[2] = {
+	static const SBC_ALIGNED int32_t round_c[4] = {
+		1 << (SBC_PROTO_FIXED8_SCALE - 1),
+		1 << (SBC_PROTO_FIXED8_SCALE - 1),
 		1 << (SBC_PROTO_FIXED8_SCALE - 1),
 		1 << (SBC_PROTO_FIXED8_SCALE - 1),
 	};
 	__asm__ volatile (
-		"movq        (%0), %%mm0\n"
-		"movq       8(%0), %%mm1\n"
-		"movq      16(%0), %%mm2\n"
-		"movq      24(%0), %%mm3\n"
-		"pmaddwd     (%1), %%mm0\n"
-		"pmaddwd    8(%1), %%mm1\n"
-		"pmaddwd   16(%1), %%mm2\n"
-		"pmaddwd   24(%1), %%mm3\n"
-		"paddd       (%2), %%mm0\n"
-		"paddd       (%2), %%mm1\n"
-		"paddd       (%2), %%mm2\n"
-		"paddd       (%2), %%mm3\n"
+		"movdqu      (%0), %%xmm0\n"
+		"movdqu    16(%0), %%xmm1\n"
+		"pmaddwd     (%1), %%xmm0\n"
+		"pmaddwd   16(%1), %%xmm1\n"
+		"paddd       (%2), %%xmm0\n"
+		"paddd       (%2), %%xmm1\n"
 		"\n"
-		"movq      32(%0), %%mm4\n"
-		"movq      40(%0), %%mm5\n"
-		"movq      48(%0), %%mm6\n"
-		"movq      56(%0), %%mm7\n"
-		"pmaddwd   32(%1), %%mm4\n"
-		"pmaddwd   40(%1), %%mm5\n"
-		"pmaddwd   48(%1), %%mm6\n"
-		"pmaddwd   56(%1), %%mm7\n"
-		"paddd      %%mm4, %%mm0\n"
-		"paddd      %%mm5, %%mm1\n"
-		"paddd      %%mm6, %%mm2\n"
-		"paddd      %%mm7, %%mm3\n"
+		"movdqu    32(%0), %%xmm2\n"
+		"movdqu    48(%0), %%xmm3\n"
+		"pmaddwd   32(%1), %%xmm2\n"
+		"pmaddwd   48(%1), %%xmm3\n"
+		"paddd     %%xmm2, %%xmm0\n"
+		"paddd     %%xmm3, %%xmm1\n"
 		"\n"
-		"movq      64(%0), %%mm4\n"
-		"movq      72(%0), %%mm5\n"
-		"movq      80(%0), %%mm6\n"
-		"movq      88(%0), %%mm7\n"
-		"pmaddwd   64(%1), %%mm4\n"
-		"pmaddwd   72(%1), %%mm5\n"
-		"pmaddwd   80(%1), %%mm6\n"
-		"pmaddwd   88(%1), %%mm7\n"
-		"paddd      %%mm4, %%mm0\n"
-		"paddd      %%mm5, %%mm1\n"
-		"paddd      %%mm6, %%mm2\n"
-		"paddd      %%mm7, %%mm3\n"
+		"movdqu    64(%0), %%xmm2\n"
+		"movdqu    80(%0), %%xmm3\n"
+		"pmaddwd   64(%1), %%xmm2\n"
+		"pmaddwd   80(%1), %%xmm3\n"
+		"paddd     %%xmm2, %%xmm0\n"
+		"paddd     %%xmm3, %%xmm1\n"
 		"\n"
-		"movq      96(%0), %%mm4\n"
-		"movq     104(%0), %%mm5\n"
-		"movq     112(%0), %%mm6\n"
-		"movq     120(%0), %%mm7\n"
-		"pmaddwd   96(%1), %%mm4\n"
-		"pmaddwd  104(%1), %%mm5\n"
-		"pmaddwd  112(%1), %%mm6\n"
-		"pmaddwd  120(%1), %%mm7\n"
-		"paddd      %%mm4, %%mm0\n"
-		"paddd      %%mm5, %%mm1\n"
-		"paddd      %%mm6, %%mm2\n"
-		"paddd      %%mm7, %%mm3\n"
+		"movdqu    96(%0), %%xmm2\n"
+		"movdqu   112(%0), %%xmm3\n"
+		"pmaddwd   96(%1), %%xmm2\n"
+		"pmaddwd  112(%1), %%xmm3\n"
+		"paddd     %%xmm2, %%xmm0\n"
+		"paddd     %%xmm3, %%xmm1\n"
 		"\n"
-		"movq     128(%0), %%mm4\n"
-		"movq     136(%0), %%mm5\n"
-		"movq     144(%0), %%mm6\n"
-		"movq     152(%0), %%mm7\n"
-		"pmaddwd  128(%1), %%mm4\n"
-		"pmaddwd  136(%1), %%mm5\n"
-		"pmaddwd  144(%1), %%mm6\n"
-		"pmaddwd  152(%1), %%mm7\n"
-		"paddd      %%mm4, %%mm0\n"
-		"paddd      %%mm5, %%mm1\n"
-		"paddd      %%mm6, %%mm2\n"
-		"paddd      %%mm7, %%mm3\n"
+		"movdqu    128(%0), %%xmm2\n"
+		"movdqu    144(%0), %%xmm3\n"
+		"pmaddwd   128(%1), %%xmm2\n"
+		"pmaddwd   144(%1), %%xmm3\n"
+		"paddd      %%xmm2, %%xmm0\n"
+		"paddd      %%xmm3, %%xmm1\n"
+		"\n"
+		"psrad         %4, %%xmm0\n"
+		"psrad         %4, %%xmm1\n"
 		"\n"
-		"psrad         %4, %%mm0\n"
-		"psrad         %4, %%mm1\n"
-		"psrad         %4, %%mm2\n"
-		"psrad         %4, %%mm3\n"
+		"movdqa     %%xmm0, %%xmm2\n"
+		"movdqa     %%xmm1, %%xmm3\n"
+		"punpckhqdq %%xmm2, %%xmm2\n"
+		"punpckhqdq %%xmm3, %%xmm3\n"
+		"movdq2q    %%xmm0, %%mm0\n"
+		"movdq2q    %%xmm2, %%mm1\n"
+		"movdq2q    %%xmm1, %%mm2\n"
+		"movdq2q    %%xmm3, %%mm3\n"
 		"\n"
 		"packssdw   %%mm0, %%mm0\n"
 		"packssdw   %%mm1, %%mm1\n"
-- 
2.26.2


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH SBC 1/3] sbc: Add initial code for SSE primitives
  2020-08-11 18:16 [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Luiz Augusto von Dentz
  2020-08-11 18:16 ` [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s Luiz Augusto von Dentz
  2020-08-11 18:16 ` [PATCH SBC 3/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_8s Luiz Augusto von Dentz
@ 2020-08-12 12:48 ` Marcel Holtmann
  2020-08-14 20:56   ` Luiz Augusto von Dentz
  2 siblings, 1 reply; 5+ messages in thread
From: Marcel Holtmann @ 2020-08-12 12:48 UTC (permalink / raw)
  To: Luiz Augusto von Dentz; +Cc: linux-bluetooth

Hi Luiz,

> 
> ---
> Makefile.am              |   1 +
> sbc/sbc_primitives.c     |  20 ++-
> sbc/sbc_primitives_sse.c | 361 +++++++++++++++++++++++++++++++++++++++
> sbc/sbc_primitives_sse.h |  38 +++++
> 4 files changed, 417 insertions(+), 3 deletions(-)
> create mode 100644 sbc/sbc_primitives_sse.c
> create mode 100644 sbc/sbc_primitives_sse.h
> 
> diff --git a/Makefile.am b/Makefile.am
> index 342043d..7ff0c7d 100644
> --- a/Makefile.am
> +++ b/Makefile.am
> @@ -14,6 +14,7 @@ sbc_headers = sbc/sbc.h
> 
> sbc_sources = sbc/sbc.c sbc/sbc_private.h sbc/sbc_math.h sbc/sbc_tables.h \
> 		sbc/sbc_primitives.h sbc/sbc_primitives.c \
> +		sbc/sbc_primitives_sse.h sbc/sbc_primitives_sse.c \
> 		sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
> 		sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
> 		sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
> diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
> index ff343cf..97a75be 100644
> --- a/sbc/sbc_primitives.c
> +++ b/sbc/sbc_primitives.c
> @@ -33,6 +33,7 @@
> #include "sbc_tables.h"
> 
> #include "sbc_primitives.h"
> +#include "sbc_primitives_sse.h"
> #include "sbc_primitives_mmx.h"
> #include "sbc_primitives_iwmmxt.h"
> #include "sbc_primitives_neon.h"
> @@ -590,6 +591,21 @@ static int sbc_calc_scalefactors_j(
> 	return joint;
> }
> 
> +static void sbc_init_primitives_x86(struct sbc_encoder_state *state)
> +{
> +	__builtin_cpu_init();
> +
> +#ifdef SBC_BUILD_WITH_MMX_SUPPORT
> +	if (__builtin_cpu_supports("mmx"))
> +		sbc_init_primitives_mmx(state);
> +#endif
> +
> +#ifdef SBC_BUILD_WITH_SSE_SUPPORT
> +	if (__builtin_cpu_supports("sse4.2"))
> +		sbc_init_primitives_sse(state);
> +#endif

lets keep the ifdef in the primitive functions they belong. This should be consistent across all primitives and not spread in two places.

Regards

Marcel


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH SBC 1/3] sbc: Add initial code for SSE primitives
  2020-08-12 12:48 ` [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Marcel Holtmann
@ 2020-08-14 20:56   ` Luiz Augusto von Dentz
  0 siblings, 0 replies; 5+ messages in thread
From: Luiz Augusto von Dentz @ 2020-08-14 20:56 UTC (permalink / raw)
  To: Marcel Holtmann; +Cc: linux-bluetooth

Hi Marcel,

On Wed, Aug 12, 2020 at 5:48 AM Marcel Holtmann <marcel@holtmann.org> wrote:
>
> Hi Luiz,
>
> >
> > ---
> > Makefile.am              |   1 +
> > sbc/sbc_primitives.c     |  20 ++-
> > sbc/sbc_primitives_sse.c | 361 +++++++++++++++++++++++++++++++++++++++
> > sbc/sbc_primitives_sse.h |  38 +++++
> > 4 files changed, 417 insertions(+), 3 deletions(-)
> > create mode 100644 sbc/sbc_primitives_sse.c
> > create mode 100644 sbc/sbc_primitives_sse.h
> >
> > diff --git a/Makefile.am b/Makefile.am
> > index 342043d..7ff0c7d 100644
> > --- a/Makefile.am
> > +++ b/Makefile.am
> > @@ -14,6 +14,7 @@ sbc_headers = sbc/sbc.h
> >
> > sbc_sources = sbc/sbc.c sbc/sbc_private.h sbc/sbc_math.h sbc/sbc_tables.h \
> >               sbc/sbc_primitives.h sbc/sbc_primitives.c \
> > +             sbc/sbc_primitives_sse.h sbc/sbc_primitives_sse.c \
> >               sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
> >               sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
> >               sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
> > diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
> > index ff343cf..97a75be 100644
> > --- a/sbc/sbc_primitives.c
> > +++ b/sbc/sbc_primitives.c
> > @@ -33,6 +33,7 @@
> > #include "sbc_tables.h"
> >
> > #include "sbc_primitives.h"
> > +#include "sbc_primitives_sse.h"
> > #include "sbc_primitives_mmx.h"
> > #include "sbc_primitives_iwmmxt.h"
> > #include "sbc_primitives_neon.h"
> > @@ -590,6 +591,21 @@ static int sbc_calc_scalefactors_j(
> >       return joint;
> > }
> >
> > +static void sbc_init_primitives_x86(struct sbc_encoder_state *state)
> > +{
> > +     __builtin_cpu_init();
> > +
> > +#ifdef SBC_BUILD_WITH_MMX_SUPPORT
> > +     if (__builtin_cpu_supports("mmx"))
> > +             sbc_init_primitives_mmx(state);
> > +#endif
> > +
> > +#ifdef SBC_BUILD_WITH_SSE_SUPPORT
> > +     if (__builtin_cpu_supports("sse4.2"))
> > +             sbc_init_primitives_sse(state);
> > +#endif
>
> lets keep the ifdef in the primitive functions they belong. This should be consistent across all primitives and not spread in two places.

I guess you mean moving #ifdef SBC_BUILD_WITH_SSE_SUPPORT into
sbc_primitives_sse.c, same for mmx, right? I will fix that, although I
was thinking on not even compiling those files when the config option
is not enabled which would result in undefined symbols, I can however
have alternative versions when its options are not enabled which
basically does nothing.

> Regards
>
> Marcel
>


-- 
Luiz Augusto von Dentz

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2020-08-14 20:57 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-11 18:16 [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Luiz Augusto von Dentz
2020-08-11 18:16 ` [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s Luiz Augusto von Dentz
2020-08-11 18:16 ` [PATCH SBC 3/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_8s Luiz Augusto von Dentz
2020-08-12 12:48 ` [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Marcel Holtmann
2020-08-14 20:56   ` Luiz Augusto von Dentz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.