Giter VIP home page Giter VIP logo

Comments (2)

postzhang123 avatar postzhang123 commented on June 2, 2024

补充,该函数的整体代码,void _AVX_MNNPackCUnit(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) {
// Repacks `src` from a channel-planar layout into a channel-packed layout.
//
// As the scalar tail loops below show, the element for channel y at area
// position x is read from srcPlane[y * srcAreaOffset + x] and written to
// dstPlane[PACK_UNIT * x + y]; i.e. every area position ends up with
// PACK_UNIT consecutive channel values in dst.
//
// Parameters:
//   dst        - output buffer.
//   src        - input buffer.
//   area       - number of positions per channel that are converted.
//   depth      - number of channels.
//   areaOffset - areaOffset[0] is the distance (in floats) between two
//                consecutive channels in src; areaOffset[1] * PACK_UNIT is the
//                distance (in floats) between two consecutive channel groups
//                in dst.
//
// The work is split into three regions:
//   1. full channel groups x full area tiles   (vectorised 8x8 transpose),
//   2. "Down":  the trailing partial channel group (missing channels -> 0),
//   3. "Right": the trailing area positions of the full channel groups.
//
// NOTE(review): the code always loads/stores 8 rows of 8 floats, so PACK_UNIT
// is presumably 8 - confirm against its definition (not visible here).
// NOTE(review): offsets such as `z * srcAreaOffset * PACK_UNIT` appear to be
// evaluated in `int`; this could overflow for very large buffers - confirm.
// Number of complete PACK_UNIT-wide tiles along area / complete channel groups.
auto areaC4 = area / PACK_UNIT;
auto depthC4 = depth / PACK_UNIT;
auto srcAreaOffset = areaOffset[0];
auto dstAreaOffset = areaOffset[1];
// t0..t7 are never assigned explicitly in this function; they are stored after
// TRANSPOSE_8x8, so that macro (defined outside this excerpt) must fill them,
// presumably with the transpose of r0..r7 - confirm against its definition.
__m256 t0, t1, t2, t3, t4, t5, t6, t7;
// Region 1: complete channel groups, complete area tiles.
for (int z = 0; z < depthC4; ++z) {
auto dstPlane = dst + z * dstAreaOffset * PACK_UNIT;
auto srcPlane = src + z * srcAreaOffset * PACK_UNIT;
for (int x = 0; x < areaC4; ++x) {
auto s = srcPlane + PACK_UNIT * x;
auto d = dstPlane + PACK_UNIT * PACK_UNIT * x;
// One unaligned 8-float load per channel of the group.
auto r0 = _mm256_loadu_ps(s + 0 * srcAreaOffset);
auto r1 = _mm256_loadu_ps(s + 1 * srcAreaOffset);
auto r2 = _mm256_loadu_ps(s + 2 * srcAreaOffset);
auto r3 = _mm256_loadu_ps(s + 3 * srcAreaOffset);
auto r4 = _mm256_loadu_ps(s + 4 * srcAreaOffset);
auto r5 = _mm256_loadu_ps(s + 5 * srcAreaOffset);
auto r6 = _mm256_loadu_ps(s + 6 * srcAreaOffset);
auto r7 = _mm256_loadu_ps(s + 7 * srcAreaOffset);

        TRANSPOSE_8x8;

        // Write the 8 result vectors back to back (PACK_UNIT floats apart).
        _mm256_storeu_ps(d + PACK_UNIT * 0, t0);
        _mm256_storeu_ps(d + PACK_UNIT * 1, t1);
        _mm256_storeu_ps(d + PACK_UNIT * 2, t2);
        _mm256_storeu_ps(d + PACK_UNIT * 3, t3);
        _mm256_storeu_ps(d + PACK_UNIT * 4, t4);
        _mm256_storeu_ps(d + PACK_UNIT * 5, t5);
        _mm256_storeu_ps(d + PACK_UNIT * 6, t6);
        _mm256_storeu_ps(d + PACK_UNIT * 7, t7);
    }
}
// Despite their names, these hold the number of area positions / channels
// already covered by complete tiles, i.e. the index where the leftovers start.
auto areaRemain  = areaC4 * PACK_UNIT;
auto depthRemain = depthC4 * PACK_UNIT;
// Down
// Region 2: the last, incomplete channel group (`remain` real channels).
int remain = depth - depthRemain;
if (remain > 0) {
    float* dstPlane       = depthC4 * dstAreaOffset * PACK_UNIT + dst;
    const float* srcPlane = src + depthC4 * srcAreaOffset * PACK_UNIT;
    {
        for (int x = 0; x < areaC4; ++x) {
            auto s  = srcPlane + PACK_UNIT * x;
            auto d  = dstPlane + PACK_UNIT * PACK_UNIT * x;
            // Channel 0 always exists because remain > 0; the other rows start
            // as zero so that channels beyond `remain` are zero-filled in dst.
            auto r0 = _mm256_loadu_ps(s + 0 * srcAreaOffset);
            auto r1 = _mm256_setzero_ps();
            auto r2 = _mm256_setzero_ps();
            auto r3 = _mm256_setzero_ps();
            auto r4 = _mm256_setzero_ps();
            auto r5 = _mm256_setzero_ps();
            auto r6 = _mm256_setzero_ps();
            auto r7 = _mm256_setzero_ps();
            // The cases have no `break`: entering at `case remain` falls
            // through and loads rows remain-1 down to 1, never touching rows
            // at or beyond `remain`.
            switch (remain) {
                case 7:
                    r6 = _mm256_loadu_ps(s + 6 * srcAreaOffset);
                case 6:
                    r5 = _mm256_loadu_ps(s + 5 * srcAreaOffset);
                case 5:
                    r4 = _mm256_loadu_ps(s + 4 * srcAreaOffset);
                case 4:
                    r3 = _mm256_loadu_ps(s + 3 * srcAreaOffset);
                case 3:
                    r2 = _mm256_loadu_ps(s + 2 * srcAreaOffset);
                case 2:
                    r1 = _mm256_loadu_ps(s + 1 * srcAreaOffset);
                default:
                    break;
            }

            TRANSPOSE_8x8;

            // Same eight destinations as in region 1, written in reverse
            // order; the targets do not overlap.
            _mm256_storeu_ps(d + PACK_UNIT * 7, t7);
            _mm256_storeu_ps(d + PACK_UNIT * 6, t6);
            _mm256_storeu_ps(d + PACK_UNIT * 5, t5);
            _mm256_storeu_ps(d + PACK_UNIT * 4, t4);
            _mm256_storeu_ps(d + PACK_UNIT * 3, t3);
            _mm256_storeu_ps(d + PACK_UNIT * 2, t2);
            _mm256_storeu_ps(d + PACK_UNIT * 1, t1);
            _mm256_storeu_ps(d + PACK_UNIT * 0, t0);
        }
    }
    // Scalar path for the area positions past the last complete tile:
    // copy the `remain` real channels, then zero-pad up to PACK_UNIT.
    for (int x = areaRemain; x < area; ++x) {
        for (int y = 0; y < remain; y++) {
            dstPlane[PACK_UNIT * x + y] = srcPlane[y * srcAreaOffset + x];
        }
        for (int y = remain; y < PACK_UNIT; y++) {
            dstPlane[PACK_UNIT * x + y] = 0;
        }
    }
}
// Right
// Region 3: complete channel groups, area positions past the last full tile.
for (int z = 0; z < depthC4; ++z) {
    float* dstPlane       = z * dstAreaOffset * PACK_UNIT + dst;
    const float* srcPlane = src + z * srcAreaOffset * PACK_UNIT;
    for (int x = areaRemain; x < area; ++x) {
        // Gather one value per channel for this single area position.
        float s0 = srcPlane[x];
        float s1 = srcPlane[x + srcAreaOffset];
        float s2 = srcPlane[x + srcAreaOffset * 2];
        float s3 = srcPlane[x + srcAreaOffset * 3];
        float s4 = srcPlane[x + srcAreaOffset * 4];
        float s5 = srcPlane[x + srcAreaOffset * 5];
        float s6 = srcPlane[x + srcAreaOffset * 6];
        float s7 = srcPlane[x + srcAreaOffset * 7];
        // _mm256_set_ps takes its arguments from the highest element to the
        // lowest, so s0 lands at dstPlane[PACK_UNIT * x + 0].
        _mm256_storeu_ps(dstPlane + PACK_UNIT * x, _mm256_set_ps(s7, s6, s5, s4, s3, s2, s1, s0));
    }
}

}

from mnn.

github-actions avatar github-actions commented on June 2, 2024

Marking as stale. No activity in 60 days.

from mnn.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games that makes everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents code.