* [PATCH SBC 1/3] sbc: Add initial code for SSE primitives
@ 2020-08-11 18:16 Luiz Augusto von Dentz
2020-08-11 18:16 ` [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s Luiz Augusto von Dentz
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Luiz Augusto von Dentz @ 2020-08-11 18:16 UTC (permalink / raw)
To: linux-bluetooth
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
Makefile.am | 1 +
sbc/sbc_primitives.c | 20 ++-
sbc/sbc_primitives_sse.c | 361 +++++++++++++++++++++++++++++++++++++++
sbc/sbc_primitives_sse.h | 38 +++++
4 files changed, 417 insertions(+), 3 deletions(-)
create mode 100644 sbc/sbc_primitives_sse.c
create mode 100644 sbc/sbc_primitives_sse.h
diff --git a/Makefile.am b/Makefile.am
index 342043d..7ff0c7d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,6 +14,7 @@ sbc_headers = sbc/sbc.h
sbc_sources = sbc/sbc.c sbc/sbc_private.h sbc/sbc_math.h sbc/sbc_tables.h \
sbc/sbc_primitives.h sbc/sbc_primitives.c \
+ sbc/sbc_primitives_sse.h sbc/sbc_primitives_sse.c \
sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index ff343cf..97a75be 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@
#include "sbc_tables.h"
#include "sbc_primitives.h"
+#include "sbc_primitives_sse.h"
#include "sbc_primitives_mmx.h"
#include "sbc_primitives_iwmmxt.h"
#include "sbc_primitives_neon.h"
@@ -590,6 +591,21 @@ static int sbc_calc_scalefactors_j(
return joint;
}
+static void sbc_init_primitives_x86(struct sbc_encoder_state *state)
+{
+ __builtin_cpu_init();
+
+#ifdef SBC_BUILD_WITH_MMX_SUPPORT
+ if (__builtin_cpu_supports("mmx"))
+ sbc_init_primitives_mmx(state);
+#endif
+
+#ifdef SBC_BUILD_WITH_SSE_SUPPORT
+ if (__builtin_cpu_supports("sse4.2"))
+ sbc_init_primitives_sse(state);
+#endif
+}
+
/*
* Detect CPU features and setup function pointers
*/
@@ -614,9 +630,7 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
state->implementation_info = "Generic C";
/* X86/AMD64 optimizations */
-#ifdef SBC_BUILD_WITH_MMX_SUPPORT
- sbc_init_primitives_mmx(state);
-#endif
+ sbc_init_primitives_x86(state);
/* ARM optimizations */
#ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
diff --git a/sbc/sbc_primitives_sse.c b/sbc/sbc_primitives_sse.c
new file mode 100644
index 0000000..c2b729a
--- /dev/null
+++ b/sbc/sbc_primitives_sse.c
@@ -0,0 +1,361 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Copyright (C) 2020 Intel Corporation
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_sse.h"
+
+/*
+ * SSE optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_SSE_SUPPORT
+
+static inline void sbc_analyze_four_sse(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{
+ static const SBC_ALIGNED int32_t round_c[2] = {
+ 1 << (SBC_PROTO_FIXED4_SCALE - 1),
+ 1 << (SBC_PROTO_FIXED4_SCALE - 1),
+ };
+ __asm__ volatile (
+ "movq (%0), %%mm0\n"
+ "movq 8(%0), %%mm1\n"
+ "pmaddwd (%1), %%mm0\n"
+ "pmaddwd 8(%1), %%mm1\n"
+ "paddd (%2), %%mm0\n"
+ "paddd (%2), %%mm1\n"
+ "\n"
+ "movq 16(%0), %%mm2\n"
+ "movq 24(%0), %%mm3\n"
+ "pmaddwd 16(%1), %%mm2\n"
+ "pmaddwd 24(%1), %%mm3\n"
+ "paddd %%mm2, %%mm0\n"
+ "paddd %%mm3, %%mm1\n"
+ "\n"
+ "movq 32(%0), %%mm2\n"
+ "movq 40(%0), %%mm3\n"
+ "pmaddwd 32(%1), %%mm2\n"
+ "pmaddwd 40(%1), %%mm3\n"
+ "paddd %%mm2, %%mm0\n"
+ "paddd %%mm3, %%mm1\n"
+ "\n"
+ "movq 48(%0), %%mm2\n"
+ "movq 56(%0), %%mm3\n"
+ "pmaddwd 48(%1), %%mm2\n"
+ "pmaddwd 56(%1), %%mm3\n"
+ "paddd %%mm2, %%mm0\n"
+ "paddd %%mm3, %%mm1\n"
+ "\n"
+ "movq 64(%0), %%mm2\n"
+ "movq 72(%0), %%mm3\n"
+ "pmaddwd 64(%1), %%mm2\n"
+ "pmaddwd 72(%1), %%mm3\n"
+ "paddd %%mm2, %%mm0\n"
+ "paddd %%mm3, %%mm1\n"
+ "\n"
+ "psrad %4, %%mm0\n"
+ "psrad %4, %%mm1\n"
+ "packssdw %%mm0, %%mm0\n"
+ "packssdw %%mm1, %%mm1\n"
+ "\n"
+ "movq %%mm0, %%mm2\n"
+ "pmaddwd 80(%1), %%mm0\n"
+ "pmaddwd 88(%1), %%mm2\n"
+ "\n"
+ "movq %%mm1, %%mm3\n"
+ "pmaddwd 96(%1), %%mm1\n"
+ "pmaddwd 104(%1), %%mm3\n"
+ "paddd %%mm1, %%mm0\n"
+ "paddd %%mm3, %%mm2\n"
+ "\n"
+ "movq %%mm0, (%3)\n"
+ "movq %%mm2, 8(%3)\n"
+ :
+ : "r" (in), "r" (consts), "r" (&round_c), "r" (out),
+ "i" (SBC_PROTO_FIXED4_SCALE)
+ : "cc", "memory");
+}
+
+static inline void sbc_analyze_eight_sse(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{
+ static const SBC_ALIGNED int32_t round_c[2] = {
+ 1 << (SBC_PROTO_FIXED8_SCALE - 1),
+ 1 << (SBC_PROTO_FIXED8_SCALE - 1),
+ };
+ __asm__ volatile (
+ "movq (%0), %%mm0\n"
+ "movq 8(%0), %%mm1\n"
+ "movq 16(%0), %%mm2\n"
+ "movq 24(%0), %%mm3\n"
+ "pmaddwd (%1), %%mm0\n"
+ "pmaddwd 8(%1), %%mm1\n"
+ "pmaddwd 16(%1), %%mm2\n"
+ "pmaddwd 24(%1), %%mm3\n"
+ "paddd (%2), %%mm0\n"
+ "paddd (%2), %%mm1\n"
+ "paddd (%2), %%mm2\n"
+ "paddd (%2), %%mm3\n"
+ "\n"
+ "movq 32(%0), %%mm4\n"
+ "movq 40(%0), %%mm5\n"
+ "movq 48(%0), %%mm6\n"
+ "movq 56(%0), %%mm7\n"
+ "pmaddwd 32(%1), %%mm4\n"
+ "pmaddwd 40(%1), %%mm5\n"
+ "pmaddwd 48(%1), %%mm6\n"
+ "pmaddwd 56(%1), %%mm7\n"
+ "paddd %%mm4, %%mm0\n"
+ "paddd %%mm5, %%mm1\n"
+ "paddd %%mm6, %%mm2\n"
+ "paddd %%mm7, %%mm3\n"
+ "\n"
+ "movq 64(%0), %%mm4\n"
+ "movq 72(%0), %%mm5\n"
+ "movq 80(%0), %%mm6\n"
+ "movq 88(%0), %%mm7\n"
+ "pmaddwd 64(%1), %%mm4\n"
+ "pmaddwd 72(%1), %%mm5\n"
+ "pmaddwd 80(%1), %%mm6\n"
+ "pmaddwd 88(%1), %%mm7\n"
+ "paddd %%mm4, %%mm0\n"
+ "paddd %%mm5, %%mm1\n"
+ "paddd %%mm6, %%mm2\n"
+ "paddd %%mm7, %%mm3\n"
+ "\n"
+ "movq 96(%0), %%mm4\n"
+ "movq 104(%0), %%mm5\n"
+ "movq 112(%0), %%mm6\n"
+ "movq 120(%0), %%mm7\n"
+ "pmaddwd 96(%1), %%mm4\n"
+ "pmaddwd 104(%1), %%mm5\n"
+ "pmaddwd 112(%1), %%mm6\n"
+ "pmaddwd 120(%1), %%mm7\n"
+ "paddd %%mm4, %%mm0\n"
+ "paddd %%mm5, %%mm1\n"
+ "paddd %%mm6, %%mm2\n"
+ "paddd %%mm7, %%mm3\n"
+ "\n"
+ "movq 128(%0), %%mm4\n"
+ "movq 136(%0), %%mm5\n"
+ "movq 144(%0), %%mm6\n"
+ "movq 152(%0), %%mm7\n"
+ "pmaddwd 128(%1), %%mm4\n"
+ "pmaddwd 136(%1), %%mm5\n"
+ "pmaddwd 144(%1), %%mm6\n"
+ "pmaddwd 152(%1), %%mm7\n"
+ "paddd %%mm4, %%mm0\n"
+ "paddd %%mm5, %%mm1\n"
+ "paddd %%mm6, %%mm2\n"
+ "paddd %%mm7, %%mm3\n"
+ "\n"
+ "psrad %4, %%mm0\n"
+ "psrad %4, %%mm1\n"
+ "psrad %4, %%mm2\n"
+ "psrad %4, %%mm3\n"
+ "\n"
+ "packssdw %%mm0, %%mm0\n"
+ "packssdw %%mm1, %%mm1\n"
+ "packssdw %%mm2, %%mm2\n"
+ "packssdw %%mm3, %%mm3\n"
+ "\n"
+ "movq %%mm0, %%mm4\n"
+ "movq %%mm0, %%mm5\n"
+ "pmaddwd 160(%1), %%mm4\n"
+ "pmaddwd 168(%1), %%mm5\n"
+ "\n"
+ "movq %%mm1, %%mm6\n"
+ "movq %%mm1, %%mm7\n"
+ "pmaddwd 192(%1), %%mm6\n"
+ "pmaddwd 200(%1), %%mm7\n"
+ "paddd %%mm6, %%mm4\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm2, %%mm6\n"
+ "movq %%mm2, %%mm7\n"
+ "pmaddwd 224(%1), %%mm6\n"
+ "pmaddwd 232(%1), %%mm7\n"
+ "paddd %%mm6, %%mm4\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm3, %%mm6\n"
+ "movq %%mm3, %%mm7\n"
+ "pmaddwd 256(%1), %%mm6\n"
+ "pmaddwd 264(%1), %%mm7\n"
+ "paddd %%mm6, %%mm4\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm4, (%3)\n"
+ "movq %%mm5, 8(%3)\n"
+ "\n"
+ "movq %%mm0, %%mm5\n"
+ "pmaddwd 176(%1), %%mm0\n"
+ "pmaddwd 184(%1), %%mm5\n"
+ "\n"
+ "movq %%mm1, %%mm7\n"
+ "pmaddwd 208(%1), %%mm1\n"
+ "pmaddwd 216(%1), %%mm7\n"
+ "paddd %%mm1, %%mm0\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm2, %%mm7\n"
+ "pmaddwd 240(%1), %%mm2\n"
+ "pmaddwd 248(%1), %%mm7\n"
+ "paddd %%mm2, %%mm0\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm3, %%mm7\n"
+ "pmaddwd 272(%1), %%mm3\n"
+ "pmaddwd 280(%1), %%mm7\n"
+ "paddd %%mm3, %%mm0\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm0, 16(%3)\n"
+ "movq %%mm5, 24(%3)\n"
+ :
+ : "r" (in), "r" (consts), "r" (&round_c), "r" (out),
+ "i" (SBC_PROTO_FIXED8_SCALE)
+ : "cc", "memory");
+}
+
+static inline void sbc_analyze_4b_4s_sse(struct sbc_encoder_state *state,
+ int16_t *x, int32_t *out, int out_stride)
+{
+ /* Analyze blocks */
+ sbc_analyze_four_sse(x + 12, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride;
+ sbc_analyze_four_sse(x + 8, out, analysis_consts_fixed4_simd_even);
+ out += out_stride;
+ sbc_analyze_four_sse(x + 4, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride;
+ sbc_analyze_four_sse(x + 0, out, analysis_consts_fixed4_simd_even);
+
+ __asm__ volatile ("emms\n");
+}
+
+static inline void sbc_analyze_4b_8s_sse(struct sbc_encoder_state *state,
+ int16_t *x, int32_t *out, int out_stride)
+{
+ /* Analyze blocks */
+ sbc_analyze_eight_sse(x + 24, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride;
+ sbc_analyze_eight_sse(x + 16, out, analysis_consts_fixed8_simd_even);
+ out += out_stride;
+ sbc_analyze_eight_sse(x + 8, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride;
+ sbc_analyze_eight_sse(x + 0, out, analysis_consts_fixed8_simd_even);
+
+ __asm__ volatile ("emms\n");
+}
+
+static inline void sbc_analyze_1b_8s_sse_even(struct sbc_encoder_state *state,
+ int16_t *x, int32_t *out, int out_stride);
+
+static inline void sbc_analyze_1b_8s_sse_odd(struct sbc_encoder_state *state,
+ int16_t *x, int32_t *out, int out_stride)
+{
+ sbc_analyze_eight_sse(x, out, analysis_consts_fixed8_simd_odd);
+ state->sbc_analyze_8s = sbc_analyze_1b_8s_sse_even;
+
+ __asm__ volatile ("emms\n");
+}
+
+static inline void sbc_analyze_1b_8s_sse_even(struct sbc_encoder_state *state,
+ int16_t *x, int32_t *out, int out_stride)
+{
+ sbc_analyze_eight_sse(x, out, analysis_consts_fixed8_simd_even);
+ state->sbc_analyze_8s = sbc_analyze_1b_8s_sse_odd;
+
+ __asm__ volatile ("emms\n");
+}
+
+static void sbc_calc_scalefactors_sse(
+ int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int channels, int subbands)
+{
+ static const SBC_ALIGNED int32_t consts[2] = {
+ 1 << SCALE_OUT_BITS,
+ 1 << SCALE_OUT_BITS,
+ };
+ int ch, sb;
+ intptr_t blk;
+ for (ch = 0; ch < channels; ch++) {
+ for (sb = 0; sb < subbands; sb += 2) {
+ blk = (blocks - 1) * (((char *) &sb_sample_f[1][0][0] -
+ (char *) &sb_sample_f[0][0][0]));
+ __asm__ volatile (
+ "movq (%4), %%mm0\n"
+ "1:\n"
+ "movq (%1, %0), %%mm1\n"
+ "pxor %%mm2, %%mm2\n"
+ "pcmpgtd %%mm2, %%mm1\n"
+ "paddd (%1, %0), %%mm1\n"
+ "pcmpgtd %%mm1, %%mm2\n"
+ "pxor %%mm2, %%mm1\n"
+
+ "por %%mm1, %%mm0\n"
+
+ "sub %2, %0\n"
+ "jns 1b\n"
+
+ "movd %%mm0, %k0\n"
+ "psrlq $32, %%mm0\n"
+ "bsrl %k0, %k0\n"
+ "subl %5, %k0\n"
+ "movl %k0, (%3)\n"
+
+ "movd %%mm0, %k0\n"
+ "bsrl %k0, %k0\n"
+ "subl %5, %k0\n"
+ "movl %k0, 4(%3)\n"
+ : "+r" (blk)
+ : "r" (&sb_sample_f[0][ch][sb]),
+ "i" ((char *) &sb_sample_f[1][0][0] -
+ (char *) &sb_sample_f[0][0][0]),
+ "r" (&scale_factor[ch][sb]),
+ "r" (&consts),
+ "i" (SCALE_OUT_BITS)
+ : "cc", "memory");
+ }
+ }
+ __asm__ volatile ("emms\n");
+}
+
+void sbc_init_primitives_sse(struct sbc_encoder_state *state)
+{
+ state->sbc_analyze_4s = sbc_analyze_4b_4s_sse;
+ if (state->increment == 1)
+ state->sbc_analyze_8s = sbc_analyze_1b_8s_sse_odd;
+ else
+ state->sbc_analyze_8s = sbc_analyze_4b_8s_sse;
+ state->sbc_calc_scalefactors = sbc_calc_scalefactors_sse;
+ state->implementation_info = "SSE";
+}
+
+#endif
diff --git a/sbc/sbc_primitives_sse.h b/sbc/sbc_primitives_sse.h
new file mode 100644
index 0000000..8830cfd
--- /dev/null
+++ b/sbc/sbc_primitives_sse.h
@@ -0,0 +1,38 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Copyright (C) 2020 Intel Corporation
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_SSE_H
+#define __SBC_PRIMITIVES_SSE_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__)) && \
+ !defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_SSE_SUPPORT
+
+void sbc_init_primitives_sse(struct sbc_encoder_state *encoder_state);
+
+#endif
+
+#endif
--
2.26.2
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s
2020-08-11 18:16 [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Luiz Augusto von Dentz
@ 2020-08-11 18:16 ` Luiz Augusto von Dentz
2020-08-11 18:16 ` [PATCH SBC 3/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_8s Luiz Augusto von Dentz
2020-08-12 12:48 ` [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Marcel Holtmann
2 siblings, 0 replies; 5+ messages in thread
From: Luiz Augusto von Dentz @ 2020-08-11 18:16 UTC (permalink / raw)
To: linux-bluetooth
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
This makes use 128 bit XMM registers whenever possible.
$ time src/sbcenc_mmx -s 4 sin_4m.au > /dev/null
real 0m1.073s
user 0m1.039s
sys 0m0.030s
=== After ====
$ time src/sbcenc -s 4 sin_4m.au > /dev/null
real 0m1.049s
user 0m1.000s
sys 0m0.047s
---
sbc/sbc_primitives_sse.c | 58 +++++++++++++++++-----------------------
1 file changed, 25 insertions(+), 33 deletions(-)
diff --git a/sbc/sbc_primitives_sse.c b/sbc/sbc_primitives_sse.c
index c2b729a..6471bd5 100644
--- a/sbc/sbc_primitives_sse.c
+++ b/sbc/sbc_primitives_sse.c
@@ -38,48 +38,40 @@
static inline void sbc_analyze_four_sse(const int16_t *in, int32_t *out,
const FIXED_T *consts)
{
- static const SBC_ALIGNED int32_t round_c[2] = {
+ static const SBC_ALIGNED int32_t round_c[4] = {
+ 1 << (SBC_PROTO_FIXED4_SCALE - 1),
+ 1 << (SBC_PROTO_FIXED4_SCALE - 1),
1 << (SBC_PROTO_FIXED4_SCALE - 1),
1 << (SBC_PROTO_FIXED4_SCALE - 1),
};
__asm__ volatile (
- "movq (%0), %%mm0\n"
- "movq 8(%0), %%mm1\n"
- "pmaddwd (%1), %%mm0\n"
- "pmaddwd 8(%1), %%mm1\n"
- "paddd (%2), %%mm0\n"
- "paddd (%2), %%mm1\n"
+ "movdqu (%0), %%xmm0\n"
+ "pmaddwd (%1), %%xmm0\n"
+ "paddd (%2), %%xmm0\n"
"\n"
- "movq 16(%0), %%mm2\n"
- "movq 24(%0), %%mm3\n"
- "pmaddwd 16(%1), %%mm2\n"
- "pmaddwd 24(%1), %%mm3\n"
- "paddd %%mm2, %%mm0\n"
- "paddd %%mm3, %%mm1\n"
+ "movdqu 16(%0), %%xmm1\n"
+ "pmaddwd 16(%1), %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
"\n"
- "movq 32(%0), %%mm2\n"
- "movq 40(%0), %%mm3\n"
- "pmaddwd 32(%1), %%mm2\n"
- "pmaddwd 40(%1), %%mm3\n"
- "paddd %%mm2, %%mm0\n"
- "paddd %%mm3, %%mm1\n"
+ "movdqu 32(%0), %%xmm1\n"
+ "pmaddwd 32(%1), %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
"\n"
- "movq 48(%0), %%mm2\n"
- "movq 56(%0), %%mm3\n"
- "pmaddwd 48(%1), %%mm2\n"
- "pmaddwd 56(%1), %%mm3\n"
- "paddd %%mm2, %%mm0\n"
- "paddd %%mm3, %%mm1\n"
+ "movdqu 48(%0), %%xmm1\n"
+ "pmaddwd 48(%1), %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
"\n"
- "movq 64(%0), %%mm2\n"
- "movq 72(%0), %%mm3\n"
- "pmaddwd 64(%1), %%mm2\n"
- "pmaddwd 72(%1), %%mm3\n"
- "paddd %%mm2, %%mm0\n"
- "paddd %%mm3, %%mm1\n"
+ "movdqu 64(%0), %%xmm1\n"
+ "pmaddwd 64(%1), %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "\n"
+ "psrad %4, %%xmm0\n"
+ "\n"
+ "movdqa %%xmm0, %%xmm1\n"
+ "punpckhqdq %%xmm1, %%xmm1\n"
+ "movdq2q %%xmm0, %%mm0\n"
+ "movdq2q %%xmm1, %%mm1\n"
"\n"
- "psrad %4, %%mm0\n"
- "psrad %4, %%mm1\n"
"packssdw %%mm0, %%mm0\n"
"packssdw %%mm1, %%mm1\n"
"\n"
--
2.26.2
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH SBC 3/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_8s
2020-08-11 18:16 [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Luiz Augusto von Dentz
2020-08-11 18:16 ` [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s Luiz Augusto von Dentz
@ 2020-08-11 18:16 ` Luiz Augusto von Dentz
2020-08-12 12:48 ` [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Marcel Holtmann
2 siblings, 0 replies; 5+ messages in thread
From: Luiz Augusto von Dentz @ 2020-08-11 18:16 UTC (permalink / raw)
To: linux-bluetooth
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
This makes use 128 bit XMM registers whenever possible.
$ time src/sbcenc_mmx -s 8 sin_64m.au > /dev/null
real 0m1.064s
user 0m1.012s
sys 0m0.049s
=== After ====
$ time src/sbcenc -s 8 sin_64m.au > /dev/null
real 0m1.032s
user 0m0.996s
sys 0m0.033s
---
sbc/sbc_primitives_sse.c | 109 ++++++++++++++++-----------------------
1 file changed, 44 insertions(+), 65 deletions(-)
diff --git a/sbc/sbc_primitives_sse.c b/sbc/sbc_primitives_sse.c
index 6471bd5..6f8eb49 100644
--- a/sbc/sbc_primitives_sse.c
+++ b/sbc/sbc_primitives_sse.c
@@ -96,80 +96,59 @@ static inline void sbc_analyze_four_sse(const int16_t *in, int32_t *out,
static inline void sbc_analyze_eight_sse(const int16_t *in, int32_t *out,
const FIXED_T *consts)
{
- static const SBC_ALIGNED int32_t round_c[2] = {
+ static const SBC_ALIGNED int32_t round_c[4] = {
+ 1 << (SBC_PROTO_FIXED8_SCALE - 1),
+ 1 << (SBC_PROTO_FIXED8_SCALE - 1),
1 << (SBC_PROTO_FIXED8_SCALE - 1),
1 << (SBC_PROTO_FIXED8_SCALE - 1),
};
__asm__ volatile (
- "movq (%0), %%mm0\n"
- "movq 8(%0), %%mm1\n"
- "movq 16(%0), %%mm2\n"
- "movq 24(%0), %%mm3\n"
- "pmaddwd (%1), %%mm0\n"
- "pmaddwd 8(%1), %%mm1\n"
- "pmaddwd 16(%1), %%mm2\n"
- "pmaddwd 24(%1), %%mm3\n"
- "paddd (%2), %%mm0\n"
- "paddd (%2), %%mm1\n"
- "paddd (%2), %%mm2\n"
- "paddd (%2), %%mm3\n"
+ "movdqu (%0), %%xmm0\n"
+ "movdqu 16(%0), %%xmm1\n"
+ "pmaddwd (%1), %%xmm0\n"
+ "pmaddwd 16(%1), %%xmm1\n"
+ "paddd (%2), %%xmm0\n"
+ "paddd (%2), %%xmm1\n"
"\n"
- "movq 32(%0), %%mm4\n"
- "movq 40(%0), %%mm5\n"
- "movq 48(%0), %%mm6\n"
- "movq 56(%0), %%mm7\n"
- "pmaddwd 32(%1), %%mm4\n"
- "pmaddwd 40(%1), %%mm5\n"
- "pmaddwd 48(%1), %%mm6\n"
- "pmaddwd 56(%1), %%mm7\n"
- "paddd %%mm4, %%mm0\n"
- "paddd %%mm5, %%mm1\n"
- "paddd %%mm6, %%mm2\n"
- "paddd %%mm7, %%mm3\n"
+ "movdqu 32(%0), %%xmm2\n"
+ "movdqu 48(%0), %%xmm3\n"
+ "pmaddwd 32(%1), %%xmm2\n"
+ "pmaddwd 48(%1), %%xmm3\n"
+ "paddd %%xmm2, %%xmm0\n"
+ "paddd %%xmm3, %%xmm1\n"
"\n"
- "movq 64(%0), %%mm4\n"
- "movq 72(%0), %%mm5\n"
- "movq 80(%0), %%mm6\n"
- "movq 88(%0), %%mm7\n"
- "pmaddwd 64(%1), %%mm4\n"
- "pmaddwd 72(%1), %%mm5\n"
- "pmaddwd 80(%1), %%mm6\n"
- "pmaddwd 88(%1), %%mm7\n"
- "paddd %%mm4, %%mm0\n"
- "paddd %%mm5, %%mm1\n"
- "paddd %%mm6, %%mm2\n"
- "paddd %%mm7, %%mm3\n"
+ "movdqu 64(%0), %%xmm2\n"
+ "movdqu 80(%0), %%xmm3\n"
+ "pmaddwd 64(%1), %%xmm2\n"
+ "pmaddwd 80(%1), %%xmm3\n"
+ "paddd %%xmm2, %%xmm0\n"
+ "paddd %%xmm3, %%xmm1\n"
"\n"
- "movq 96(%0), %%mm4\n"
- "movq 104(%0), %%mm5\n"
- "movq 112(%0), %%mm6\n"
- "movq 120(%0), %%mm7\n"
- "pmaddwd 96(%1), %%mm4\n"
- "pmaddwd 104(%1), %%mm5\n"
- "pmaddwd 112(%1), %%mm6\n"
- "pmaddwd 120(%1), %%mm7\n"
- "paddd %%mm4, %%mm0\n"
- "paddd %%mm5, %%mm1\n"
- "paddd %%mm6, %%mm2\n"
- "paddd %%mm7, %%mm3\n"
+ "movdqu 96(%0), %%xmm2\n"
+ "movdqu 112(%0), %%xmm3\n"
+ "pmaddwd 96(%1), %%xmm2\n"
+ "pmaddwd 112(%1), %%xmm3\n"
+ "paddd %%xmm2, %%xmm0\n"
+ "paddd %%xmm3, %%xmm1\n"
"\n"
- "movq 128(%0), %%mm4\n"
- "movq 136(%0), %%mm5\n"
- "movq 144(%0), %%mm6\n"
- "movq 152(%0), %%mm7\n"
- "pmaddwd 128(%1), %%mm4\n"
- "pmaddwd 136(%1), %%mm5\n"
- "pmaddwd 144(%1), %%mm6\n"
- "pmaddwd 152(%1), %%mm7\n"
- "paddd %%mm4, %%mm0\n"
- "paddd %%mm5, %%mm1\n"
- "paddd %%mm6, %%mm2\n"
- "paddd %%mm7, %%mm3\n"
+ "movdqu 128(%0), %%xmm2\n"
+ "movdqu 144(%0), %%xmm3\n"
+ "pmaddwd 128(%1), %%xmm2\n"
+ "pmaddwd 144(%1), %%xmm3\n"
+ "paddd %%xmm2, %%xmm0\n"
+ "paddd %%xmm3, %%xmm1\n"
+ "\n"
+ "psrad %4, %%xmm0\n"
+ "psrad %4, %%xmm1\n"
"\n"
- "psrad %4, %%mm0\n"
- "psrad %4, %%mm1\n"
- "psrad %4, %%mm2\n"
- "psrad %4, %%mm3\n"
+ "movdqa %%xmm0, %%xmm2\n"
+ "movdqa %%xmm1, %%xmm3\n"
+ "punpckhqdq %%xmm2, %%xmm2\n"
+ "punpckhqdq %%xmm3, %%xmm3\n"
+ "movdq2q %%xmm0, %%mm0\n"
+ "movdq2q %%xmm2, %%mm1\n"
+ "movdq2q %%xmm1, %%mm2\n"
+ "movdq2q %%xmm3, %%mm3\n"
"\n"
"packssdw %%mm0, %%mm0\n"
"packssdw %%mm1, %%mm1\n"
--
2.26.2
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH SBC 1/3] sbc: Add initial code for SSE primitives
2020-08-11 18:16 [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Luiz Augusto von Dentz
2020-08-11 18:16 ` [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s Luiz Augusto von Dentz
2020-08-11 18:16 ` [PATCH SBC 3/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_8s Luiz Augusto von Dentz
@ 2020-08-12 12:48 ` Marcel Holtmann
2020-08-14 20:56 ` Luiz Augusto von Dentz
2 siblings, 1 reply; 5+ messages in thread
From: Marcel Holtmann @ 2020-08-12 12:48 UTC (permalink / raw)
To: Luiz Augusto von Dentz; +Cc: linux-bluetooth
Hi Luiz,
>
> ---
> Makefile.am | 1 +
> sbc/sbc_primitives.c | 20 ++-
> sbc/sbc_primitives_sse.c | 361 +++++++++++++++++++++++++++++++++++++++
> sbc/sbc_primitives_sse.h | 38 +++++
> 4 files changed, 417 insertions(+), 3 deletions(-)
> create mode 100644 sbc/sbc_primitives_sse.c
> create mode 100644 sbc/sbc_primitives_sse.h
>
> diff --git a/Makefile.am b/Makefile.am
> index 342043d..7ff0c7d 100644
> --- a/Makefile.am
> +++ b/Makefile.am
> @@ -14,6 +14,7 @@ sbc_headers = sbc/sbc.h
>
> sbc_sources = sbc/sbc.c sbc/sbc_private.h sbc/sbc_math.h sbc/sbc_tables.h \
> sbc/sbc_primitives.h sbc/sbc_primitives.c \
> + sbc/sbc_primitives_sse.h sbc/sbc_primitives_sse.c \
> sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
> sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
> sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
> diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
> index ff343cf..97a75be 100644
> --- a/sbc/sbc_primitives.c
> +++ b/sbc/sbc_primitives.c
> @@ -33,6 +33,7 @@
> #include "sbc_tables.h"
>
> #include "sbc_primitives.h"
> +#include "sbc_primitives_sse.h"
> #include "sbc_primitives_mmx.h"
> #include "sbc_primitives_iwmmxt.h"
> #include "sbc_primitives_neon.h"
> @@ -590,6 +591,21 @@ static int sbc_calc_scalefactors_j(
> return joint;
> }
>
> +static void sbc_init_primitives_x86(struct sbc_encoder_state *state)
> +{
> + __builtin_cpu_init();
> +
> +#ifdef SBC_BUILD_WITH_MMX_SUPPORT
> + if (__builtin_cpu_supports("mmx"))
> + sbc_init_primitives_mmx(state);
> +#endif
> +
> +#ifdef SBC_BUILD_WITH_SSE_SUPPORT
> + if (__builtin_cpu_supports("sse4.2"))
> + sbc_init_primitives_sse(state);
> +#endif
lets keep the ifdef in the primitive functions they belong. This should be consistent across all primitives and not spread in two places.
Regards
Marcel
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH SBC 1/3] sbc: Add initial code for SSE primitives
2020-08-12 12:48 ` [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Marcel Holtmann
@ 2020-08-14 20:56 ` Luiz Augusto von Dentz
0 siblings, 0 replies; 5+ messages in thread
From: Luiz Augusto von Dentz @ 2020-08-14 20:56 UTC (permalink / raw)
To: Marcel Holtmann; +Cc: linux-bluetooth
Hi Marcel,
On Wed, Aug 12, 2020 at 5:48 AM Marcel Holtmann <marcel@holtmann.org> wrote:
>
> Hi Luiz,
>
> >
> > ---
> > Makefile.am | 1 +
> > sbc/sbc_primitives.c | 20 ++-
> > sbc/sbc_primitives_sse.c | 361 +++++++++++++++++++++++++++++++++++++++
> > sbc/sbc_primitives_sse.h | 38 +++++
> > 4 files changed, 417 insertions(+), 3 deletions(-)
> > create mode 100644 sbc/sbc_primitives_sse.c
> > create mode 100644 sbc/sbc_primitives_sse.h
> >
> > diff --git a/Makefile.am b/Makefile.am
> > index 342043d..7ff0c7d 100644
> > --- a/Makefile.am
> > +++ b/Makefile.am
> > @@ -14,6 +14,7 @@ sbc_headers = sbc/sbc.h
> >
> > sbc_sources = sbc/sbc.c sbc/sbc_private.h sbc/sbc_math.h sbc/sbc_tables.h \
> > sbc/sbc_primitives.h sbc/sbc_primitives.c \
> > + sbc/sbc_primitives_sse.h sbc/sbc_primitives_sse.c \
> > sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
> > sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
> > sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
> > diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
> > index ff343cf..97a75be 100644
> > --- a/sbc/sbc_primitives.c
> > +++ b/sbc/sbc_primitives.c
> > @@ -33,6 +33,7 @@
> > #include "sbc_tables.h"
> >
> > #include "sbc_primitives.h"
> > +#include "sbc_primitives_sse.h"
> > #include "sbc_primitives_mmx.h"
> > #include "sbc_primitives_iwmmxt.h"
> > #include "sbc_primitives_neon.h"
> > @@ -590,6 +591,21 @@ static int sbc_calc_scalefactors_j(
> > return joint;
> > }
> >
> > +static void sbc_init_primitives_x86(struct sbc_encoder_state *state)
> > +{
> > + __builtin_cpu_init();
> > +
> > +#ifdef SBC_BUILD_WITH_MMX_SUPPORT
> > + if (__builtin_cpu_supports("mmx"))
> > + sbc_init_primitives_mmx(state);
> > +#endif
> > +
> > +#ifdef SBC_BUILD_WITH_SSE_SUPPORT
> > + if (__builtin_cpu_supports("sse4.2"))
> > + sbc_init_primitives_sse(state);
> > +#endif
>
> lets keep the ifdef in the primitive functions they belong. This should be consistent across all primitives and not spread in two places.
I guess you mean moving #ifdef SBC_BUILD_WITH_SSE_SUPPORT into
sbc_primitives_sse.c, same for mmx, right? I will fix that, although I
was thinking on not even compiling those files when the config option
is not enabled which would result in undefined symbols, I can however
have alternative versions when its options are not enabled which
basically does nothing.
> Regards
>
> Marcel
>
--
Luiz Augusto von Dentz
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2020-08-14 20:57 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-11 18:16 [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Luiz Augusto von Dentz
2020-08-11 18:16 ` [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s Luiz Augusto von Dentz
2020-08-11 18:16 ` [PATCH SBC 3/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_8s Luiz Augusto von Dentz
2020-08-12 12:48 ` [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Marcel Holtmann
2020-08-14 20:56 ` Luiz Augusto von Dentz
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.