Comments (2)
补充,该函数的整体代码,void _AVX_MNNPackCUnit(float* dst, const float* src, size_t area, size_t depth, int* areaOffset) {
auto areaC4 = area / PACK_UNIT;
auto depthC4 = depth / PACK_UNIT;
auto srcAreaOffset = areaOffset[0];
auto dstAreaOffset = areaOffset[1];
__m256 t0, t1, t2, t3, t4, t5, t6, t7;
for (int z = 0; z < depthC4; ++z) {
auto dstPlane = dst + z * dstAreaOffset * PACK_UNIT;
auto srcPlane = src + z * srcAreaOffset * PACK_UNIT;
for (int x = 0; x < areaC4; ++x) {
auto s = srcPlane + PACK_UNIT * x;
auto d = dstPlane + PACK_UNIT * PACK_UNIT * x;
auto r0 = _mm256_loadu_ps(s + 0 * srcAreaOffset);
auto r1 = _mm256_loadu_ps(s + 1 * srcAreaOffset);
auto r2 = _mm256_loadu_ps(s + 2 * srcAreaOffset);
auto r3 = _mm256_loadu_ps(s + 3 * srcAreaOffset);
auto r4 = _mm256_loadu_ps(s + 4 * srcAreaOffset);
auto r5 = _mm256_loadu_ps(s + 5 * srcAreaOffset);
auto r6 = _mm256_loadu_ps(s + 6 * srcAreaOffset);
auto r7 = _mm256_loadu_ps(s + 7 * srcAreaOffset);
TRANSPOSE_8x8;
_mm256_storeu_ps(d + PACK_UNIT * 0, t0);
_mm256_storeu_ps(d + PACK_UNIT * 1, t1);
_mm256_storeu_ps(d + PACK_UNIT * 2, t2);
_mm256_storeu_ps(d + PACK_UNIT * 3, t3);
_mm256_storeu_ps(d + PACK_UNIT * 4, t4);
_mm256_storeu_ps(d + PACK_UNIT * 5, t5);
_mm256_storeu_ps(d + PACK_UNIT * 6, t6);
_mm256_storeu_ps(d + PACK_UNIT * 7, t7);
}
}
auto areaRemain = areaC4 * PACK_UNIT;
auto depthRemain = depthC4 * PACK_UNIT;
// Down
int remain = depth - depthRemain;
if (remain > 0) {
float* dstPlane = depthC4 * dstAreaOffset * PACK_UNIT + dst;
const float* srcPlane = src + depthC4 * srcAreaOffset * PACK_UNIT;
{
for (int x = 0; x < areaC4; ++x) {
auto s = srcPlane + PACK_UNIT * x;
auto d = dstPlane + PACK_UNIT * PACK_UNIT * x;
auto r0 = _mm256_loadu_ps(s + 0 * srcAreaOffset);
auto r1 = _mm256_setzero_ps();
auto r2 = _mm256_setzero_ps();
auto r3 = _mm256_setzero_ps();
auto r4 = _mm256_setzero_ps();
auto r5 = _mm256_setzero_ps();
auto r6 = _mm256_setzero_ps();
auto r7 = _mm256_setzero_ps();
switch (remain) {
case 7:
r6 = _mm256_loadu_ps(s + 6 * srcAreaOffset);
case 6:
r5 = _mm256_loadu_ps(s + 5 * srcAreaOffset);
case 5:
r4 = _mm256_loadu_ps(s + 4 * srcAreaOffset);
case 4:
r3 = _mm256_loadu_ps(s + 3 * srcAreaOffset);
case 3:
r2 = _mm256_loadu_ps(s + 2 * srcAreaOffset);
case 2:
r1 = _mm256_loadu_ps(s + 1 * srcAreaOffset);
default:
break;
}
TRANSPOSE_8x8;
_mm256_storeu_ps(d + PACK_UNIT * 7, t7);
_mm256_storeu_ps(d + PACK_UNIT * 6, t6);
_mm256_storeu_ps(d + PACK_UNIT * 5, t5);
_mm256_storeu_ps(d + PACK_UNIT * 4, t4);
_mm256_storeu_ps(d + PACK_UNIT * 3, t3);
_mm256_storeu_ps(d + PACK_UNIT * 2, t2);
_mm256_storeu_ps(d + PACK_UNIT * 1, t1);
_mm256_storeu_ps(d + PACK_UNIT * 0, t0);
}
}
for (int x = areaRemain; x < area; ++x) {
for (int y = 0; y < remain; y++) {
dstPlane[PACK_UNIT * x + y] = srcPlane[y * srcAreaOffset + x];
}
for (int y = remain; y < PACK_UNIT; y++) {
dstPlane[PACK_UNIT * x + y] = 0;
}
}
}
// Right
for (int z = 0; z < depthC4; ++z) {
float* dstPlane = z * dstAreaOffset * PACK_UNIT + dst;
const float* srcPlane = src + z * srcAreaOffset * PACK_UNIT;
for (int x = areaRemain; x < area; ++x) {
float s0 = srcPlane[x];
float s1 = srcPlane[x + srcAreaOffset];
float s2 = srcPlane[x + srcAreaOffset * 2];
float s3 = srcPlane[x + srcAreaOffset * 3];
float s4 = srcPlane[x + srcAreaOffset * 4];
float s5 = srcPlane[x + srcAreaOffset * 5];
float s6 = srcPlane[x + srcAreaOffset * 6];
float s7 = srcPlane[x + srcAreaOffset * 7];
_mm256_storeu_ps(dstPlane + PACK_UNIT * x, _mm256_set_ps(s7, s6, s5, s4, s3, s2, s1, s0));
}
}
}
from mnn.
Marking as stale. No activity in 60 days.
from mnn.
Related Issues (20)
- MNN内存泄漏检查 HOT 2
- MNN Llama-3-8B-Instruct export 失败 HOT 1
- Cut MNN build size according to operators to use HOT 2
- 添加完 Multihead attention 算子后,报错 Reshape41 算子异常 HOT 1
- meta-llama3-8b-instruct 使用llm_demo 推理报错 HOT 2
- onnx转mnn之后模型推理和原模型差距太大 HOT 3
- phi2 使用 llm_demo 推理报错 HOT 2
- 有转换mobilenet V3成功的老铁吗?转为mnn之后准确率下降很多 HOT 1
- 量化后输出怎么从float改成int8
- 求助!!!!!!!!打印 input_tensor内容全部都是0 HOT 1
- fastOnnxTest 成功 但使用时输出不一致 HOT 2
- 编译示例工程代码报错
- 编译测试工具报错阻塞 HOT 3
- 问一下, 在armv8.2 cpu 手机上, 是否已支持fp16 进行卷积的推理阿? HOT 1
- c++ 链接MNN后,编译报错,无法解析的外部符号 _std_init_once_link_alternate_names_and_abort和_std_min_element_4 HOT 5
- mnncompress量化模型以后大小没有变化 HOT 3
- MNN.cv has the rotation issue in loading some photos from the phone HOT 1
- 求教 expr 中_Fill 的正确用法 HOT 3
- 鸿蒙系统编译失败 HOT 1
- Facing an issue with version incompatibility between the protobuf library and the MNN library while performing the mnnquant quantization process using the Python API of MNN HOT 4
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Something interesting about the web. A new door to the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Something interesting about visualization: using data as art.
-
Game
Something interesting about games; make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from mnn.