From: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
To: linux-bluetooth@vger.kernel.org
Subject: [PATCH SBC 1/3] sbc: Add initial code for SSE primitives
Date: Tue, 11 Aug 2020 11:16:21 -0700 [thread overview]
Message-ID: <20200811181623.3683374-1-luiz.dentz@gmail.com> (raw)
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
Makefile.am | 1 +
sbc/sbc_primitives.c | 20 ++-
sbc/sbc_primitives_sse.c | 361 +++++++++++++++++++++++++++++++++++++++
sbc/sbc_primitives_sse.h | 38 +++++
4 files changed, 417 insertions(+), 3 deletions(-)
create mode 100644 sbc/sbc_primitives_sse.c
create mode 100644 sbc/sbc_primitives_sse.h
diff --git a/Makefile.am b/Makefile.am
index 342043d..7ff0c7d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,6 +14,7 @@ sbc_headers = sbc/sbc.h
sbc_sources = sbc/sbc.c sbc/sbc_private.h sbc/sbc_math.h sbc/sbc_tables.h \
sbc/sbc_primitives.h sbc/sbc_primitives.c \
+ sbc/sbc_primitives_sse.h sbc/sbc_primitives_sse.c \
sbc/sbc_primitives_mmx.h sbc/sbc_primitives_mmx.c \
sbc/sbc_primitives_iwmmxt.h sbc/sbc_primitives_iwmmxt.c \
sbc/sbc_primitives_neon.h sbc/sbc_primitives_neon.c \
diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index ff343cf..97a75be 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -33,6 +33,7 @@
#include "sbc_tables.h"
#include "sbc_primitives.h"
+#include "sbc_primitives_sse.h"
#include "sbc_primitives_mmx.h"
#include "sbc_primitives_iwmmxt.h"
#include "sbc_primitives_neon.h"
@@ -590,6 +591,21 @@ static int sbc_calc_scalefactors_j(
return joint;
}
+static void sbc_init_primitives_x86(struct sbc_encoder_state *state)
+{
+ __builtin_cpu_init();
+
+#ifdef SBC_BUILD_WITH_MMX_SUPPORT
+ if (__builtin_cpu_supports("mmx"))
+ sbc_init_primitives_mmx(state);
+#endif
+
+#ifdef SBC_BUILD_WITH_SSE_SUPPORT
+ if (__builtin_cpu_supports("sse4.2"))
+ sbc_init_primitives_sse(state);
+#endif
+}
+
/*
* Detect CPU features and setup function pointers
*/
@@ -614,9 +630,7 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
state->implementation_info = "Generic C";
/* X86/AMD64 optimizations */
-#ifdef SBC_BUILD_WITH_MMX_SUPPORT
- sbc_init_primitives_mmx(state);
-#endif
+ sbc_init_primitives_x86(state);
/* ARM optimizations */
#ifdef SBC_BUILD_WITH_ARMV6_SUPPORT
diff --git a/sbc/sbc_primitives_sse.c b/sbc/sbc_primitives_sse.c
new file mode 100644
index 0000000..c2b729a
--- /dev/null
+++ b/sbc/sbc_primitives_sse.c
@@ -0,0 +1,361 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Copyright (C) 2020 Intel Corporation
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdint.h>
+#include <limits.h>
+#include "sbc.h"
+#include "sbc_math.h"
+#include "sbc_tables.h"
+
+#include "sbc_primitives_sse.h"
+
+/*
+ * SSE optimizations
+ */
+
+#ifdef SBC_BUILD_WITH_SSE_SUPPORT
+
+static inline void sbc_analyze_four_sse(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{
+ static const SBC_ALIGNED int32_t round_c[2] = {
+ 1 << (SBC_PROTO_FIXED4_SCALE - 1),
+ 1 << (SBC_PROTO_FIXED4_SCALE - 1),
+ };
+ __asm__ volatile (
+ "movq (%0), %%mm0\n"
+ "movq 8(%0), %%mm1\n"
+ "pmaddwd (%1), %%mm0\n"
+ "pmaddwd 8(%1), %%mm1\n"
+ "paddd (%2), %%mm0\n"
+ "paddd (%2), %%mm1\n"
+ "\n"
+ "movq 16(%0), %%mm2\n"
+ "movq 24(%0), %%mm3\n"
+ "pmaddwd 16(%1), %%mm2\n"
+ "pmaddwd 24(%1), %%mm3\n"
+ "paddd %%mm2, %%mm0\n"
+ "paddd %%mm3, %%mm1\n"
+ "\n"
+ "movq 32(%0), %%mm2\n"
+ "movq 40(%0), %%mm3\n"
+ "pmaddwd 32(%1), %%mm2\n"
+ "pmaddwd 40(%1), %%mm3\n"
+ "paddd %%mm2, %%mm0\n"
+ "paddd %%mm3, %%mm1\n"
+ "\n"
+ "movq 48(%0), %%mm2\n"
+ "movq 56(%0), %%mm3\n"
+ "pmaddwd 48(%1), %%mm2\n"
+ "pmaddwd 56(%1), %%mm3\n"
+ "paddd %%mm2, %%mm0\n"
+ "paddd %%mm3, %%mm1\n"
+ "\n"
+ "movq 64(%0), %%mm2\n"
+ "movq 72(%0), %%mm3\n"
+ "pmaddwd 64(%1), %%mm2\n"
+ "pmaddwd 72(%1), %%mm3\n"
+ "paddd %%mm2, %%mm0\n"
+ "paddd %%mm3, %%mm1\n"
+ "\n"
+ "psrad %4, %%mm0\n"
+ "psrad %4, %%mm1\n"
+ "packssdw %%mm0, %%mm0\n"
+ "packssdw %%mm1, %%mm1\n"
+ "\n"
+ "movq %%mm0, %%mm2\n"
+ "pmaddwd 80(%1), %%mm0\n"
+ "pmaddwd 88(%1), %%mm2\n"
+ "\n"
+ "movq %%mm1, %%mm3\n"
+ "pmaddwd 96(%1), %%mm1\n"
+ "pmaddwd 104(%1), %%mm3\n"
+ "paddd %%mm1, %%mm0\n"
+ "paddd %%mm3, %%mm2\n"
+ "\n"
+ "movq %%mm0, (%3)\n"
+ "movq %%mm2, 8(%3)\n"
+ :
+ : "r" (in), "r" (consts), "r" (&round_c), "r" (out),
+ "i" (SBC_PROTO_FIXED4_SCALE)
+ : "cc", "memory");
+}
+
+static inline void sbc_analyze_eight_sse(const int16_t *in, int32_t *out,
+ const FIXED_T *consts)
+{
+ static const SBC_ALIGNED int32_t round_c[2] = {
+ 1 << (SBC_PROTO_FIXED8_SCALE - 1),
+ 1 << (SBC_PROTO_FIXED8_SCALE - 1),
+ };
+ __asm__ volatile (
+ "movq (%0), %%mm0\n"
+ "movq 8(%0), %%mm1\n"
+ "movq 16(%0), %%mm2\n"
+ "movq 24(%0), %%mm3\n"
+ "pmaddwd (%1), %%mm0\n"
+ "pmaddwd 8(%1), %%mm1\n"
+ "pmaddwd 16(%1), %%mm2\n"
+ "pmaddwd 24(%1), %%mm3\n"
+ "paddd (%2), %%mm0\n"
+ "paddd (%2), %%mm1\n"
+ "paddd (%2), %%mm2\n"
+ "paddd (%2), %%mm3\n"
+ "\n"
+ "movq 32(%0), %%mm4\n"
+ "movq 40(%0), %%mm5\n"
+ "movq 48(%0), %%mm6\n"
+ "movq 56(%0), %%mm7\n"
+ "pmaddwd 32(%1), %%mm4\n"
+ "pmaddwd 40(%1), %%mm5\n"
+ "pmaddwd 48(%1), %%mm6\n"
+ "pmaddwd 56(%1), %%mm7\n"
+ "paddd %%mm4, %%mm0\n"
+ "paddd %%mm5, %%mm1\n"
+ "paddd %%mm6, %%mm2\n"
+ "paddd %%mm7, %%mm3\n"
+ "\n"
+ "movq 64(%0), %%mm4\n"
+ "movq 72(%0), %%mm5\n"
+ "movq 80(%0), %%mm6\n"
+ "movq 88(%0), %%mm7\n"
+ "pmaddwd 64(%1), %%mm4\n"
+ "pmaddwd 72(%1), %%mm5\n"
+ "pmaddwd 80(%1), %%mm6\n"
+ "pmaddwd 88(%1), %%mm7\n"
+ "paddd %%mm4, %%mm0\n"
+ "paddd %%mm5, %%mm1\n"
+ "paddd %%mm6, %%mm2\n"
+ "paddd %%mm7, %%mm3\n"
+ "\n"
+ "movq 96(%0), %%mm4\n"
+ "movq 104(%0), %%mm5\n"
+ "movq 112(%0), %%mm6\n"
+ "movq 120(%0), %%mm7\n"
+ "pmaddwd 96(%1), %%mm4\n"
+ "pmaddwd 104(%1), %%mm5\n"
+ "pmaddwd 112(%1), %%mm6\n"
+ "pmaddwd 120(%1), %%mm7\n"
+ "paddd %%mm4, %%mm0\n"
+ "paddd %%mm5, %%mm1\n"
+ "paddd %%mm6, %%mm2\n"
+ "paddd %%mm7, %%mm3\n"
+ "\n"
+ "movq 128(%0), %%mm4\n"
+ "movq 136(%0), %%mm5\n"
+ "movq 144(%0), %%mm6\n"
+ "movq 152(%0), %%mm7\n"
+ "pmaddwd 128(%1), %%mm4\n"
+ "pmaddwd 136(%1), %%mm5\n"
+ "pmaddwd 144(%1), %%mm6\n"
+ "pmaddwd 152(%1), %%mm7\n"
+ "paddd %%mm4, %%mm0\n"
+ "paddd %%mm5, %%mm1\n"
+ "paddd %%mm6, %%mm2\n"
+ "paddd %%mm7, %%mm3\n"
+ "\n"
+ "psrad %4, %%mm0\n"
+ "psrad %4, %%mm1\n"
+ "psrad %4, %%mm2\n"
+ "psrad %4, %%mm3\n"
+ "\n"
+ "packssdw %%mm0, %%mm0\n"
+ "packssdw %%mm1, %%mm1\n"
+ "packssdw %%mm2, %%mm2\n"
+ "packssdw %%mm3, %%mm3\n"
+ "\n"
+ "movq %%mm0, %%mm4\n"
+ "movq %%mm0, %%mm5\n"
+ "pmaddwd 160(%1), %%mm4\n"
+ "pmaddwd 168(%1), %%mm5\n"
+ "\n"
+ "movq %%mm1, %%mm6\n"
+ "movq %%mm1, %%mm7\n"
+ "pmaddwd 192(%1), %%mm6\n"
+ "pmaddwd 200(%1), %%mm7\n"
+ "paddd %%mm6, %%mm4\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm2, %%mm6\n"
+ "movq %%mm2, %%mm7\n"
+ "pmaddwd 224(%1), %%mm6\n"
+ "pmaddwd 232(%1), %%mm7\n"
+ "paddd %%mm6, %%mm4\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm3, %%mm6\n"
+ "movq %%mm3, %%mm7\n"
+ "pmaddwd 256(%1), %%mm6\n"
+ "pmaddwd 264(%1), %%mm7\n"
+ "paddd %%mm6, %%mm4\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm4, (%3)\n"
+ "movq %%mm5, 8(%3)\n"
+ "\n"
+ "movq %%mm0, %%mm5\n"
+ "pmaddwd 176(%1), %%mm0\n"
+ "pmaddwd 184(%1), %%mm5\n"
+ "\n"
+ "movq %%mm1, %%mm7\n"
+ "pmaddwd 208(%1), %%mm1\n"
+ "pmaddwd 216(%1), %%mm7\n"
+ "paddd %%mm1, %%mm0\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm2, %%mm7\n"
+ "pmaddwd 240(%1), %%mm2\n"
+ "pmaddwd 248(%1), %%mm7\n"
+ "paddd %%mm2, %%mm0\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm3, %%mm7\n"
+ "pmaddwd 272(%1), %%mm3\n"
+ "pmaddwd 280(%1), %%mm7\n"
+ "paddd %%mm3, %%mm0\n"
+ "paddd %%mm7, %%mm5\n"
+ "\n"
+ "movq %%mm0, 16(%3)\n"
+ "movq %%mm5, 24(%3)\n"
+ :
+ : "r" (in), "r" (consts), "r" (&round_c), "r" (out),
+ "i" (SBC_PROTO_FIXED8_SCALE)
+ : "cc", "memory");
+}
+
+static inline void sbc_analyze_4b_4s_sse(struct sbc_encoder_state *state,
+ int16_t *x, int32_t *out, int out_stride)
+{
+ /* Analyze blocks */
+ sbc_analyze_four_sse(x + 12, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride;
+ sbc_analyze_four_sse(x + 8, out, analysis_consts_fixed4_simd_even);
+ out += out_stride;
+ sbc_analyze_four_sse(x + 4, out, analysis_consts_fixed4_simd_odd);
+ out += out_stride;
+ sbc_analyze_four_sse(x + 0, out, analysis_consts_fixed4_simd_even);
+
+ __asm__ volatile ("emms\n");
+}
+
+static inline void sbc_analyze_4b_8s_sse(struct sbc_encoder_state *state,
+ int16_t *x, int32_t *out, int out_stride)
+{
+ /* Analyze blocks */
+ sbc_analyze_eight_sse(x + 24, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride;
+ sbc_analyze_eight_sse(x + 16, out, analysis_consts_fixed8_simd_even);
+ out += out_stride;
+ sbc_analyze_eight_sse(x + 8, out, analysis_consts_fixed8_simd_odd);
+ out += out_stride;
+ sbc_analyze_eight_sse(x + 0, out, analysis_consts_fixed8_simd_even);
+
+ __asm__ volatile ("emms\n");
+}
+
+static inline void sbc_analyze_1b_8s_sse_even(struct sbc_encoder_state *state,
+ int16_t *x, int32_t *out, int out_stride);
+
+static inline void sbc_analyze_1b_8s_sse_odd(struct sbc_encoder_state *state,
+ int16_t *x, int32_t *out, int out_stride)
+{
+ sbc_analyze_eight_sse(x, out, analysis_consts_fixed8_simd_odd);
+ state->sbc_analyze_8s = sbc_analyze_1b_8s_sse_even;
+
+ __asm__ volatile ("emms\n");
+}
+
+static inline void sbc_analyze_1b_8s_sse_even(struct sbc_encoder_state *state,
+ int16_t *x, int32_t *out, int out_stride)
+{
+ sbc_analyze_eight_sse(x, out, analysis_consts_fixed8_simd_even);
+ state->sbc_analyze_8s = sbc_analyze_1b_8s_sse_odd;
+
+ __asm__ volatile ("emms\n");
+}
+
+static void sbc_calc_scalefactors_sse(
+ int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int channels, int subbands)
+{
+ static const SBC_ALIGNED int32_t consts[2] = {
+ 1 << SCALE_OUT_BITS,
+ 1 << SCALE_OUT_BITS,
+ };
+ int ch, sb;
+ intptr_t blk;
+ for (ch = 0; ch < channels; ch++) {
+ for (sb = 0; sb < subbands; sb += 2) {
+ blk = (blocks - 1) * (((char *) &sb_sample_f[1][0][0] -
+ (char *) &sb_sample_f[0][0][0]));
+ __asm__ volatile (
+ "movq (%4), %%mm0\n"
+ "1:\n"
+ "movq (%1, %0), %%mm1\n"
+ "pxor %%mm2, %%mm2\n"
+ "pcmpgtd %%mm2, %%mm1\n"
+ "paddd (%1, %0), %%mm1\n"
+ "pcmpgtd %%mm1, %%mm2\n"
+ "pxor %%mm2, %%mm1\n"
+
+ "por %%mm1, %%mm0\n"
+
+ "sub %2, %0\n"
+ "jns 1b\n"
+
+ "movd %%mm0, %k0\n"
+ "psrlq $32, %%mm0\n"
+ "bsrl %k0, %k0\n"
+ "subl %5, %k0\n"
+ "movl %k0, (%3)\n"
+
+ "movd %%mm0, %k0\n"
+ "bsrl %k0, %k0\n"
+ "subl %5, %k0\n"
+ "movl %k0, 4(%3)\n"
+ : "+r" (blk)
+ : "r" (&sb_sample_f[0][ch][sb]),
+ "i" ((char *) &sb_sample_f[1][0][0] -
+ (char *) &sb_sample_f[0][0][0]),
+ "r" (&scale_factor[ch][sb]),
+ "r" (&consts),
+ "i" (SCALE_OUT_BITS)
+ : "cc", "memory");
+ }
+ }
+ __asm__ volatile ("emms\n");
+}
+
+void sbc_init_primitives_sse(struct sbc_encoder_state *state)
+{
+ state->sbc_analyze_4s = sbc_analyze_4b_4s_sse;
+ if (state->increment == 1)
+ state->sbc_analyze_8s = sbc_analyze_1b_8s_sse_odd;
+ else
+ state->sbc_analyze_8s = sbc_analyze_4b_8s_sse;
+ state->sbc_calc_scalefactors = sbc_calc_scalefactors_sse;
+ state->implementation_info = "SSE";
+}
+
+#endif
diff --git a/sbc/sbc_primitives_sse.h b/sbc/sbc_primitives_sse.h
new file mode 100644
index 0000000..8830cfd
--- /dev/null
+++ b/sbc/sbc_primitives_sse.h
@@ -0,0 +1,38 @@
+/*
+ *
+ * Bluetooth low-complexity, subband codec (SBC) library
+ *
+ * Copyright (C) 2020 Intel Corporation
+ *
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef __SBC_PRIMITIVES_SSE_H
+#define __SBC_PRIMITIVES_SSE_H
+
+#include "sbc_primitives.h"
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__amd64__)) && \
+ !defined(SBC_HIGH_PRECISION) && (SCALE_OUT_BITS == 15)
+
+#define SBC_BUILD_WITH_SSE_SUPPORT
+
+void sbc_init_primitives_sse(struct sbc_encoder_state *encoder_state);
+
+#endif
+
+#endif
--
2.26.2
next reply other threads:[~2020-08-11 18:16 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-08-11 18:16 Luiz Augusto von Dentz [this message]
2020-08-11 18:16 ` [PATCH SBC 2/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_4s Luiz Augusto von Dentz
2020-08-11 18:16 ` [PATCH SBC 3/3] sbc/sbc_primitives_sse: Optimize sbc_analyze_8s Luiz Augusto von Dentz
2020-08-12 12:48 ` [PATCH SBC 1/3] sbc: Add initial code for SSE primitives Marcel Holtmann
2020-08-14 20:56 ` Luiz Augusto von Dentz
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200811181623.3683374-1-luiz.dentz@gmail.com \
--to=luiz.dentz@gmail.com \
--cc=linux-bluetooth@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.