Comments (2)
Conclusion: this can make a whole product about 8% faster, if AVX2 instructions are effectively enabled.
Not measured on arm64; probably not helpful at all there, probably -3%.
But the compatibility cost is so high (we can't really go above SSE3) that it's not worth it in real products.
from dplug.
Patch for dplug:fft, should the topic be revived.
module dplug.fft.avx_float;
import inteli.avx2intrin;
import dplug.fft.fft_impl;
nothrow @nogc:
/**
 * SIMD back-end description for 8-lane single-precision vectors (`float8`).
 *
 * Bundles the vector type, its size constants, and the static shuffle,
 * load/store and bit-reversal helpers defined below.
 * NOTE(review): presumably consumed by the templates in dplug.fft.fft_impl
 * in the same way as the other vector back-ends -- confirm.
 */
struct Vector
{
nothrow @nogc:
    alias vec = float8; // From intel-intrinsics, not core.simd
    alias T = float;
    enum vec_size = 8;                    // number of T elements held by one vec
    enum log2_bitreverse_chunk_size = 3;

    /// Reinterprets a scalar pointer as a pointer to 4-float vectors.
    static auto v(T* p){ return cast(float4*) p; }

    /// Reinterprets a scalar pointer as a pointer to 8-float vectors.
    /// NOTE(review): dereferencing the result presumably requires `p` to be
    /// suitably aligned for float8 -- confirm against callers.
    static auto v8(T* p){ return cast(float8*) p; }

    /**
     * Regroups the 128-bit halves of two vectors:
     *   r0 = [ low half of a0 | low half of a1 ]
     *   r1 = [ high half of a0 | high half of a1 ]
     * (first listed half goes to the low 128 bits of the result).
     * Inputs are taken by value, so r0/r1 may alias the caller's a0/a1.
     */
    static void _deinterleave2(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        __m128 a0_lo = _mm256_extractf128_ps!0(a0);
        __m128 a0_hi = _mm256_extractf128_ps!1(a0);
        __m128 a1_lo = _mm256_extractf128_ps!0(a1);
        __m128 a1_hi = _mm256_extractf128_ps!1(a1);
        // _mm256_set_m128 takes (high half, low half).
        r0 = _mm256_set_m128(a1_lo, a0_lo);
        r1 = _mm256_set_m128(a1_hi, a0_hi);
    }

    // the three functions below do not do exactly what the names imply, but that's
    // ok (fft works correctly when using them)

    /**
     * Loads interleaved complex values from `arr` and writes their real
     * components to `rr` and imaginary components to `ri`; `n` selects the
     * layout variant (8, 4 or 2). Any other `n` is a compile-time error.
     */
    static void complex_array_to_real_imag_vec(int n)(T* arr, ref vec rr, ref vec ri)
    {
        static if(n == 8)
        {
            deinterleave(v8(arr)[0], v8(arr)[1], rr, ri);
        }
        else static if (n == 4)
        {
            vec a = *v8(arr);
            rr = shufps!(2, 2, 0, 0)(a, a);
            ri = shufps!(3, 3, 1, 1)(a, a);
        }
        else static if(n == 2)
        {
            // NOTE(review): insert128_0 / insert128_1 are not defined in this
            // struct; this branch only compiles if they are provided elsewhere
            // (e.g. by an import) -- confirm before instantiating with n == 2.
            rr = insert128_0(rr, arr[0]);
            rr = insert128_1(rr, arr[2]);
            ri = insert128_0(ri, arr[1]);
            ri = insert128_1(ri, arr[3]);
        }
        else
            static assert(0);
    }

    /// Element-wise unpack (low and high) of a0 and a1, followed by a
    /// regrouping of the 128-bit halves via _deinterleave2.
    static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        vec a0_tmp = _mm256_unpacklo_ps (a0, a1);
        a1 = _mm256_unpackhi_ps (a0, a1);
        _deinterleave2(a0_tmp, a1, r0, r1);
    }

    /// Regroups the 128-bit halves of a0/a1, then writes the even-indexed
    /// elements of each half to r0 and the odd-indexed ones to r1.
    static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        _deinterleave2(a0, a1, a0, a1);
        r0 = shufps!(2,0,2,0)(a0, a1);
        r1 = shufps!(3,1,3,1)(a0, a1);
    }

    /**
     * Shuffle step selected at compile time by `elements_per_vector`
     * (8, 4 or 2); any other value is a compile-time error.
     */
    static void transpose(int elements_per_vector)(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if(elements_per_vector == 8)
        {
            r0 = shufps!(2,0,2,0)(a0, a1);
            r1 = shufps!(3,1,3,1)(a0, a1);
            r0 = shufps!(3,1,2,0)(r0, r0);
            r1 = shufps!(3,1,2,0)(r1, r1);
        }
        else static if(elements_per_vector == 4)
        {
            r0 = shufps!(1,0,1,0)(a0, a1);
            r1 = shufps!(3,2,3,2)(a0, a1);
        }
        else static if(elements_per_vector == 2)
        {
            // NOTE(review): interleave128_lo / interleave128_hi are not defined
            // in this struct; this branch only compiles if they are provided
            // elsewhere -- confirm before instantiating with 2.
            r0 = interleave128_lo(a0, a1);
            r1 = interleave128_hi(a0, a1);
        }
        else
            static assert(0);
    }

    /// In-place two-stage shuffle of four vectors; used by br64 below.
    private static void br16_two(ref vec a0, ref vec a1, ref vec a2, ref vec a3)
    {
        vec b0 = shufps!(1, 0, 1, 0)(a0, a2);
        vec b1 = shufps!(1, 0, 1, 0)(a1, a3);
        vec b2 = shufps!(3, 2, 3, 2)(a0, a2);
        vec b3 = shufps!(3, 2, 3, 2)(a1, a3);
        a0 = shufps!(2, 0, 2, 0)(b0, b1);
        a1 = shufps!(2, 0, 2, 0)(b2, b3);
        a2 = shufps!(3, 1, 3, 1)(b0, b1);
        a3 = shufps!(3, 1, 3, 1)(b2, b3);
    }

    /// In-place bit-reversal permutation over eight vectors (64 floats).
    private static void br64(
        ref vec a0, ref vec a1, ref vec a2, ref vec a3,
        ref vec a4, ref vec a5, ref vec a6, ref vec a7)
    {
        // reverse the outer four bits
        br16_two(a0, a2, a4, a6);
        br16_two(a1, a3, a5, a7);
        // reverse the inner two bits
        _deinterleave2(a0, a1, a0, a1);
        _deinterleave2(a2, a3, a2, a3);
        _deinterleave2(a4, a5, a4, a5);
        _deinterleave2(a6, a7, a6, a7);
    }

    /// Builds a compile-time sequence made of `n` copies of `T` followed by `R`.
    template RepeatType(T, int n, R...)
    {
        static if(n == 0)
            alias R RepeatType;
        else
            alias RepeatType!(T, n - 1, T, R) RepeatType;
    }

    /**
     * Loads eight vectors from `p0` and eight from `p1` (consecutive vectors
     * are `m` elements apart), applies br64 to each group, and stores each
     * permuted group at the other pointer's locations.
     */
    static void bit_reverse_swap(T* p0, T* p1, size_t m)
    {
        RepeatType!(vec, 8) a, b;
        foreach(i, _; a)
            a[i] = *v8(p0 + i * m);
        br64(a);
        foreach(i, _; a)
            b[i] = *v8(p1 + i * m);
        foreach(i, _; a)
            *v8(p1 + i * m) = a[i];
        br64(b);
        foreach(i, _; a)
            *v8(p0 + i * m) = b[i];
    }

    /// Loads eight vectors from `p0` (consecutive vectors are `m` elements
    /// apart), applies br64, and stores them back to the same locations.
    static void bit_reverse(T* p0, size_t m)
    {
        RepeatType!(vec, 8) a;
        foreach(i, _; a)
            a[i] = *v8(p0 + i * m);
        br64(a);
        foreach(i, _; a)
            *v8(p0 + i * m) = a[i];
    }

    /// Returns a vector with all 8 lanes set to `a`.
    static vec scalar_to_vector(T a)
    {
        // Explicit broadcast: does not rely on an implicit scalar-to-vector
        // conversion, which is only available when float8 is a native
        // compiler vector type.
        return _mm256_set1_ps(a);
    }

    /// Loads 8 floats from `p` with an unaligned load.
    static vec unaligned_load(T* p)
    {
        return _mm256_loadu_ps(p);
    }

    /// Stores the 8 floats of `v` to `p` with an unaligned store.
    static void unaligned_store(T* p, vec v)
    {
        _mm256_storeu_ps(p, v);
    }

    /// Reverses the order of the 8 elements: first within each 128-bit half,
    /// then by swapping the two halves.
    static vec reverse(vec v)
    {
        v = shufps!(0, 1, 2, 3)(v, v);
        return reverse128(v);
    }

    /// Wraps _mm256_shuffle_ps; the immediate is built from four 2-bit
    /// selectors, with `i0` in the lowest bits and `i3` in the highest.
    static vec shufps(int i3, int i2, int i1, int i0)(vec a, vec b)
    {
        enum shuf_mask = i0 | (i1<<2) | (i2<<4) | (i3<<6);
        return _mm256_shuffle_ps!shuf_mask(a, b);
    }

    /// Swaps the low and high 128-bit halves of `v`.
    static vec reverse128(vec v)
    {
        __m128 v_lo = _mm256_extractf128_ps!0(v);
        __m128 v_hi = _mm256_extractf128_ps!1(v);
        // _mm256_set_m128 takes (high half, low half).
        return _mm256_set_m128(v_lo, v_hi);
    }
}
/// Compile-time tuning constants, declared as a single group of manifest constants.
/// NOTE(review): presumably read by the FFT implementation templates in
/// dplug.fft.fft_impl -- confirm there what each value controls.
struct Options
{
    enum
    {
        log2_bitreverse_large_chunk_size = 5,
        large_limit = 14,
        log2_optimal_n = 8,
        passes_per_recursive_call = 4,
        log2_recursive_passes_chunk_size = 4,
        prefered_alignment = 4 * (1 << 10), // evaluates to 4096
    }
}
from dplug.
Related Issues (20)
- Linear RGB renderer
- Some Monterey and Ventura macOS deletes the Mac and Windows installer after ZIP download HOT 4
- Subnormal problem? In large session. HOT 1
- Wiki: Better env var GUI hint in Dplug VST2 Guide HOT 3
- More macOS crashes , like issue 741 HOT 2
- Bad fix for macOS bug? Revealed in Live 12 HOT 6
- More Windows installer convenience HOT 1
- Cubase Pro 13 + unknown format + Windows 10 + VST3 => not recognized, can't escape blacklist
- REAPER + AU => no latency reporting? HOT 3
- destroyNoGC(interface) HOT 1
- Garageband 10.4 + Sonoma 14.2.1 + AU + Graillon 2.8(?) => no show HOT 4
- Windows + Cubase v13 + VST3 + Inner Pitch 1.1 HOT 1
- 1-1 mono + Inner Pitch => crash HOT 3
- Consider other build options
- AAX + macOS Catalina + Panagement 2.6 => instacrash
- Graillon 2.8 + Windows 11 + Protools 12.6 => not installing!
- usability HOT 1
- OnOffSwitch doesn't respect dirtyRect
- Use newer stb_image_resize2.h
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from dplug.