Giter VIP home page Giter VIP logo

Comments (2)

p0nce avatar p0nce commented on June 4, 2024

Conclusion: this can make a whole product about 8% faster, if AVX2 instructions are effectively enabled.
Not measured on arm64, where it is probably not helpful at all (probably -3%).
But the compatibility cost is so high (we can't really go above SSE3) that it's not worth it in real products.

from dplug.

p0nce avatar p0nce commented on June 4, 2024

Patch for dplug:fft, should the topic be revived.

module dplug.fft.avx_float;

import inteli.avx2intrin;
import dplug.fft.fft_impl;

nothrow @nogc:

struct Vector
{
nothrow @nogc:
    /// Vector type used by this backend: `float8` from intel-intrinsics
    /// (not the one from core.simd).
    alias vec = float8;

    /// Scalar element type.
    alias T = float;

    /// Number of `T` elements held by one `vec`.
    enum vec_size = 8;

    enum log2_bitreverse_chunk_size = 3;

    /// Reinterprets `p` as a pointer to `float4`.
    static auto v(T* p)
    {
        return cast(float4*) p;
    }

    /// Reinterprets `p` as a pointer to `float8`.
    static auto v8(T* p)
    {
        return cast(float8*) p;
    }

    /// Regroups the 128-bit halves of `a0` and `a1`:
    /// `r0` receives half 0 of both inputs, `r1` receives half 1 of both.
    ///
    /// All four halves are extracted before anything is written, so callers
    /// may pass the same variables as inputs and outputs.
    static void _deinterleave2(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        __m128 firstHalf0  = _mm256_extractf128_ps!0(a0);
        __m128 firstHalf1  = _mm256_extractf128_ps!1(a0);
        __m128 secondHalf0 = _mm256_extractf128_ps!0(a1);
        __m128 secondHalf1 = _mm256_extractf128_ps!1(a1);

        r0 = _mm256_set_m128(secondHalf0, firstHalf0);
        r1 = _mm256_set_m128(secondHalf1, firstHalf1);
    }

    // Note: the next three functions do not do exactly what their names
    // suggest. That is fine: the FFT works correctly when using them.

    /// Loads data from `arr` and splits it into `rr` and `ri`.
    /// `n` selects the layout variant at compile time; only 8, 4 and 2 are
    /// accepted.
    static void complex_array_to_real_imag_vec(int n)(T* arr, ref vec rr, ref vec ri)
    {
        static if (n == 8)
        {
            auto src = v8(arr);
            deinterleave(src[0], src[1], rr, ri);
        }
        else static if (n == 4)
        {
            vec packed = *v8(arr);
            rr = shufps!(2, 2, 0, 0)(packed, packed);
            ri = shufps!(3, 3, 1, 1)(packed, packed);
        }
        else static if (n == 2)
        {
            // NOTE(review): insert128_0 / insert128_1 are not defined in the
            // visible part of this file - confirm they exist before
            // instantiating this branch.
            rr = insert128_0(rr, arr[0]);
            rr = insert128_1(rr, arr[2]);
            ri = insert128_0(ri, arr[1]);
            ri = insert128_1(ri, arr[3]);
        }
        else
        {
            static assert(0);
        }
    }

    /// Combines `a0` and `a1` with unpacklo/unpackhi, then regroups the
    /// 128-bit halves of the two results into `r0` and `r1`.
    static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        vec unpackedLo = _mm256_unpacklo_ps(a0, a1);
        vec unpackedHi = _mm256_unpackhi_ps(a0, a1);
        _deinterleave2(unpackedLo, unpackedHi, r0, r1);
    }

    /// Regroups the 128-bit halves of `a0`/`a1`, then shuffles the
    /// even-indexed elements into `r0` and the odd-indexed ones into `r1`.
    static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        vec regrouped0, regrouped1;
        _deinterleave2(a0, a1, regrouped0, regrouped1);

        r0 = shufps!(2, 0, 2, 0)(regrouped0, regrouped1);
        r1 = shufps!(3, 1, 3, 1)(regrouped0, regrouped1);
    }

    /// Shuffle step selected at compile time by `elements_per_vector`
    /// (8, 4 or 2); writes the two results into `r0` and `r1`.
    static void transpose(int elements_per_vector)(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if (elements_per_vector == 8)
        {
            r0 = shufps!(2, 0, 2, 0)(a0, a1);
            r1 = shufps!(3, 1, 3, 1)(a0, a1);
            r0 = shufps!(3, 1, 2, 0)(r0, r0);
            r1 = shufps!(3, 1, 2, 0)(r1, r1);
        }
        else static if (elements_per_vector == 4)
        {
            r0 = shufps!(1, 0, 1, 0)(a0, a1);
            r1 = shufps!(3, 2, 3, 2)(a0, a1);
        }
        else static if (elements_per_vector == 2)
        {
            // NOTE(review): interleave128_lo / interleave128_hi are not
            // defined in the visible part of this file - confirm they exist
            // before instantiating this branch.
            r0 = interleave128_lo(a0, a1);
            r1 = interleave128_hi(a0, a1);
        }
        else
        {
            static assert(0);
        }
    }

    /// In-place two-stage shuffle over four vectors; used by `br64`.
    private static void br16_two(ref vec a0, ref vec a1, ref vec a2, ref vec a3)
    {
        // Stage 1: all four intermediates are computed before any output is
        // overwritten.
        vec pairLo02 = shufps!(1, 0, 1, 0)(a0, a2);
        vec pairLo13 = shufps!(1, 0, 1, 0)(a1, a3);
        vec pairHi02 = shufps!(3, 2, 3, 2)(a0, a2);
        vec pairHi13 = shufps!(3, 2, 3, 2)(a1, a3);

        // Stage 2: pick even / odd elements from the intermediates.
        a0 = shufps!(2, 0, 2, 0)(pairLo02, pairLo13);
        a1 = shufps!(2, 0, 2, 0)(pairHi02, pairHi13);
        a2 = shufps!(3, 1, 3, 1)(pairLo02, pairLo13);
        a3 = shufps!(3, 1, 3, 1)(pairHi02, pairHi13);
    }

    /// In-place bit-reversal permutation over eight vectors (64 floats).
    private static void br64(
        ref vec a0, ref vec a1, ref vec a2, ref vec a3,
        ref vec a4, ref vec a5, ref vec a6, ref vec a7)
    {
        // Reverse the outer four bits.
        br16_two(a0, a2, a4, a6);
        br16_two(a1, a3, a5, a7);

        // Reverse the inner two bits.
        _deinterleave2(a0, a1, a0, a1);
        _deinterleave2(a2, a3, a2, a3);
        _deinterleave2(a4, a5, a4, a5);
        _deinterleave2(a6, a7, a6, a7);
    }

    /// Builds a compile-time sequence made of `n` copies of `T`
    /// (prepended to `R`).
    template RepeatType(T, int n, R...)
    {
        static if (n == 0)
            alias RepeatType = R;
        else
            alias RepeatType = RepeatType!(T, n - 1, T, R);
    }

    /// Bit-reverses the 8-vector chunk at `p0` and the one at `p1`, and
    /// stores each result at the other location. Consecutive vectors of a
    /// chunk are `m` elements apart.
    static void bit_reverse_swap(T* p0, T* p1, size_t m)
    {
        RepeatType!(vec, 8) chunk0;
        RepeatType!(vec, 8) chunk1;

        foreach (k, unused; chunk0)
            chunk0[k] = *v8(p0 + k * m);

        br64(chunk0);

        // The p1 chunk is loaded before it gets overwritten below.
        foreach (k, unused; chunk0)
            chunk1[k] = *v8(p1 + k * m);

        foreach (k, unused; chunk0)
            *v8(p1 + k * m) = chunk0[k];

        br64(chunk1);

        foreach (k, unused; chunk0)
            *v8(p0 + k * m) = chunk1[k];
    }

    /// Bit-reverses, in place, the 8-vector chunk at `p0`. Consecutive
    /// vectors of the chunk are `m` elements apart.
    static void bit_reverse(T* p0, size_t m)
    {
        RepeatType!(vec, 8) chunk;

        foreach (k, unused; chunk)
            chunk[k] = *v8(p0 + k * m);

        br64(chunk);

        foreach (k, unused; chunk)
            *v8(p0 + k * m) = chunk[k];
    }

    /// Converts the scalar `a` to a `vec`.
    static vec scalar_to_vector(T a)
    {
        return a;
    }

    /// Loads one `vec` from `p` with `_mm256_loadu_ps`.
    static vec unaligned_load(T* p)
    {
        return _mm256_loadu_ps(p);
    }

    /// Stores `v` to `p` with `_mm256_storeu_ps`.
    static void unaligned_store(T* p, vec v)
    {
        _mm256_storeu_ps(p, v);
    }

    /// Returns `v` with its elements in reverse order: elements are first
    /// reversed by `shufps`, then the two 128-bit halves are swapped.
    static vec reverse(vec v)
    {
        return reverse128(shufps!(0, 1, 2, 3)(v, v));
    }

    /// `_mm256_shuffle_ps` wrapper taking the four 2-bit selectors as
    /// separate template arguments; `i0` ends up in the lowest bits of the
    /// immediate.
    static vec shufps(int i3, int i2, int i1, int i0)(vec a, vec b)
    {
        enum imm8 = (i3 << 6) | (i2 << 4) | (i1 << 2) | i0;
        return _mm256_shuffle_ps!imm8(a, b);
    }

    /// Swaps the two 128-bit halves of `v`.
    static vec reverse128(vec v)
    {
        __m128 half0 = _mm256_extractf128_ps!0(v);
        __m128 half1 = _mm256_extractf128_ps!1(v);
        return _mm256_set_m128(half0, half1);
    }
}

/// Compile-time tuning constants.
/// NOTE(review): presumably consumed by dplug.fft.fft_impl; the exact meaning
/// of each value is not visible in this file - confirm against that module.
struct Options
{
    enum int log2_bitreverse_large_chunk_size = 5;
    enum int large_limit = 14;
    enum int log2_optimal_n = 8;
    enum int passes_per_recursive_call = 4;
    enum int log2_recursive_passes_chunk_size = 4;

    // Evaluates to 4096.
    enum int prefered_alignment = 4 * 1024;
}

from dplug.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.