From 756f5aa23d0bd870817f33a0a1d5eeb8c09e7dde Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Wed, 24 Jun 2020 09:11:02 +0000 Subject: [PATCH 1/4] Restyled by clang-format --- src/layer/x86/avx_usability.h | 3 +- src/layer/x86/cast_x86.cpp | 8 +- src/layer/x86/convolution_1x1_pack8.h | 174 ++-- src/layer/x86/convolution_1x1_pack8_fp16.h | 178 ++-- src/layer/x86/convolution_2x2_pack8.h | 2 +- src/layer/x86/convolution_2x2_pack8_fp16.h | 4 +- src/layer/x86/convolution_3x3_pack8.h | 14 +- src/layer/x86/convolution_3x3_pack8_fp16.h | 16 +- src/layer/x86/convolution_x86.cpp | 86 +- .../x86/convolutiondepthwise_3x3_pack8_fp16.h | 6 +- src/layer/x86/convolutiondepthwise_x86.cpp | 25 +- src/layer/x86/innerproduct_x86.cpp | 863 +++++++++--------- src/layer/x86/innerproduct_x86.h | 14 +- tests/testutil.h | 1 - 14 files changed, 708 insertions(+), 686 deletions(-) mode change 100755 => 100644 src/layer/x86/innerproduct_x86.cpp diff --git a/src/layer/x86/avx_usability.h b/src/layer/x86/avx_usability.h index c35b84dcd559..e05b7554253c 100644 --- a/src/layer/x86/avx_usability.h +++ b/src/layer/x86/avx_usability.h @@ -18,7 +18,8 @@ #define AVX_USABILITY #include -static inline __m256 loadfp16(const unsigned short* ptr) { +static inline __m256 loadfp16(const unsigned short* ptr) +{ return _mm256_cvtph_ps(_mm_load_si128((__m128i*)(ptr))); } static inline __m256 _mm256_fmadd_1_ps(__m256 a, __m256 b, float c) diff --git a/src/layer/x86/cast_x86.cpp b/src/layer/x86/cast_x86.cpp index efb9df78758f..6c3ba6ad6103 100644 --- a/src/layer/x86/cast_x86.cpp +++ b/src/layer/x86/cast_x86.cpp @@ -134,7 +134,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) for (int i = 0; i < remain; i++) mask.m256i_u32[i] = 0x80000000; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = bottom_blob.channel(q); @@ -166,7 +166,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) for (int i = 0; i < remain; i++) mask.m256i_u32[i] = 0x80000000; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const unsigned short* ptr = bottom_blob.channel(q); @@ -192,7 +192,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (type_from == 4 && type_to == 1) { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const unsigned short* ptr = bottom_blob.channel(q); @@ -217,7 +217,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (type_from == 1 && type_to == 4) { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = bottom_blob.channel(q); diff --git a/src/layer/x86/convolution_1x1_pack8.h b/src/layer/x86/convolution_1x1_pack8.h index 6665741faa9e..e75a3c107e3d 100644 --- a/src/layer/x86/convolution_1x1_pack8.h +++ b/src/layer/x86/convolution_1x1_pack8.h @@ -14,7 +14,6 @@ static void conv1x1s1_sgemm_transform_kernel_pack8_avx(const Mat& kernel, Mat& weight_data_pack8, int num_input, int num_output) { - // src = kw-kh-inch-outch // dst = 8b-8a-kw-kh-inch/8a-outch/8b Mat weight_data_r2 = kernel.reshape(1, num_input, num_output); @@ -109,90 +108,89 @@ static void conv1x1s1_sgemm_transform_kernel_pack8_avx(const Mat& kernel, Mat& w const float* k77 = k7.row(p + 7); float* g00 = g0.row(p / 8); - g00[0] = k00[0]; - g00[1] = k10[0]; - g00[2] = k20[0]; - g00[3] = k30[0]; - g00[4] = k40[0]; - g00[5] = k50[0]; - g00[6] = k60[0]; - g00[7] = k70[0]; - g00 += 8; - g00[0] = k01[0]; - g00[1] = k11[0]; - g00[2] = k21[0]; - g00[3] = k31[0]; - g00[4] = k41[0]; - g00[5] = k51[0]; - g00[6] = k61[0]; - g00[7] = k71[0]; - - g00 += 8; - g00[0] = k02[0]; - g00[1] = k12[0]; - g00[2] = k22[0]; - g00[3] = k32[0]; - g00[4] = k42[0]; - g00[5] = k52[0]; - g00[6] = k62[0]; - g00[7] = k72[0]; - - g00 += 8; - g00[0] = k03[0]; - g00[1] = k13[0]; - g00[2] = k23[0]; - g00[3] = k33[0]; - g00[4] = k43[0]; - g00[5] = k53[0]; - g00[6] = k63[0]; - g00[7] = k73[0]; - - g00 += 8; - g00[0] = k04[0]; - g00[1] = k14[0]; - g00[2] = k24[0]; - g00[3] = k34[0]; - g00[4] = k44[0]; - g00[5] = k54[0]; - g00[6] = k64[0]; - g00[7] = k74[0]; - - g00 += 8; - g00[0] = k05[0]; - g00[1] = k15[0]; - g00[2] = k25[0]; - g00[3] = k35[0]; - g00[4] = k45[0]; - g00[5] = k55[0]; - g00[6] = k65[0]; - g00[7] = k75[0]; - - g00 += 8; - g00[0] = k06[0]; - g00[1] = k16[0]; - g00[2] = k26[0]; - g00[3] = k36[0]; - g00[4] = k46[0]; - g00[5] = k56[0]; - g00[6] = k66[0]; - g00[7] = k76[0]; - - g00 += 8; - g00[0] = k07[0]; - g00[1] = k17[0]; - g00[2] = k27[0]; - g00[3] = k37[0]; - g00[4] = k47[0]; - g00[5] = k57[0]; - g00[6] = k67[0]; - g00[7] = k77[0]; - - g00 += 8; + g00[0] = k00[0]; + g00[1] = k10[0]; + g00[2] = k20[0]; + g00[3] = k30[0]; + g00[4] = k40[0]; + g00[5] = k50[0]; + g00[6] = k60[0]; + g00[7] = k70[0]; + g00 += 8; + g00[0] = k01[0]; + g00[1] = k11[0]; + g00[2] = k21[0]; + g00[3] = k31[0]; + g00[4] = k41[0]; + g00[5] = k51[0]; + g00[6] = k61[0]; + g00[7] = k71[0]; + + g00 += 8; + g00[0] = k02[0]; + g00[1] = k12[0]; + g00[2] = k22[0]; + g00[3] = k32[0]; + g00[4] = k42[0]; + g00[5] = k52[0]; + g00[6] = k62[0]; + g00[7] = k72[0]; + + g00 += 8; + g00[0] = k03[0]; + g00[1] = k13[0]; + g00[2] = k23[0]; + g00[3] = k33[0]; + g00[4] = k43[0]; + g00[5] = k53[0]; + g00[6] = k63[0]; + g00[7] = k73[0]; + + g00 += 8; + g00[0] = k04[0]; + g00[1] = k14[0]; + g00[2] = k24[0]; + g00[3] = k34[0]; + g00[4] = k44[0]; + g00[5] = k54[0]; + g00[6] = k64[0]; + g00[7] = k74[0]; + + g00 += 8; + g00[0] = k05[0]; + g00[1] = k15[0]; + g00[2] = k25[0]; + g00[3] = k35[0]; + g00[4] = k45[0]; + g00[5] = k55[0]; + g00[6] = k65[0]; + g00[7] = k75[0]; + + g00 += 8; + g00[0] = k06[0]; + g00[1] = k16[0]; + g00[2] = k26[0]; + g00[3] = k36[0]; + g00[4] = k46[0]; + g00[5] = k56[0]; + g00[6] = k66[0]; + g00[7] = k76[0]; + + g00 += 8; + g00[0] = k07[0]; + g00[1] = k17[0]; + g00[2] = k27[0]; + g00[3] = k37[0]; + g00[4] = k47[0]; + g00[5] = k57[0]; + g00[6] = k67[0]; + g00[7] = k77[0]; + + g00 += 8; } } } - static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; @@ -212,7 +210,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con { int nn_size = size / 12; int remain_size_start = nn_size * 12; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = ii * 12; @@ -253,7 +251,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } } nn_size = (size - remain_size_start) >> 3; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 8; @@ -290,7 +288,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con remain_size_start += nn_size << 3; nn_size = (size - remain_size_start) >> 2; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 4; @@ -317,7 +315,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con remain_size_start += nn_size << 2; nn_size = (size - remain_size_start) >> 1; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 2; @@ -339,7 +337,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } remain_size_start += nn_size << 1; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int i = remain_size_start; i < size; i++) { const float* img0 = bottom_blob.channel(0); @@ -355,7 +353,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out = top_blob.channel(p); @@ -1016,7 +1014,7 @@ static void conv1x1s2_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat Mat bottom_blob_shrinked; bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < channels; p++) { const float* r0 = bottom_blob.channel(p); diff --git a/src/layer/x86/convolution_1x1_pack8_fp16.h b/src/layer/x86/convolution_1x1_pack8_fp16.h index 222ffff8f3e9..5642e4a210d2 100644 --- a/src/layer/x86/convolution_1x1_pack8_fp16.h +++ b/src/layer/x86/convolution_1x1_pack8_fp16.h @@ -13,7 +13,7 @@ // specific language governing permissions and limitations under the License. static void conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(const Mat& kernel, Mat& weight_data_pack8, int num_input, int num_output) { - // src = kw-kh-inch-outch + // src = kw-kh-inch-outch // dst = 8b-8a-kw-kh-inch/8a-outch/8b Mat weight_data_r2 = kernel.reshape(1, num_input, num_output); @@ -106,92 +106,90 @@ static void conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(const Mat& kernel, M const float* k76 = k7.row(p + 6); const float* k77 = k7.row(p + 7); - unsigned short* g00 =(unsigned short*) g0.row(p / 8); - g00[0] = float32_to_float16(k00[0]); - g00[1] = float32_to_float16(k10[0]); - g00[2] = float32_to_float16(k20[0]); - g00[3] = float32_to_float16(k30[0]); - g00[4] = float32_to_float16(k40[0]); - g00[5] = float32_to_float16(k50[0]); - g00[6] = float32_to_float16(k60[0]); - g00[7] = float32_to_float16(k70[0]); - g00 += 8; - g00[0] = float32_to_float16(k01[0]); - g00[1] = float32_to_float16(k11[0]); - g00[2] = float32_to_float16(k21[0]); - g00[3] = float32_to_float16(k31[0]); - g00[4] = float32_to_float16(k41[0]); - g00[5] = float32_to_float16(k51[0]); - g00[6] = float32_to_float16(k61[0]); - g00[7] = float32_to_float16(k71[0]); - - g00 += 8; - g00[0] = float32_to_float16(k02[0]); - g00[1] = float32_to_float16(k12[0]); - g00[2] = float32_to_float16(k22[0]); - g00[3] = float32_to_float16(k32[0]); - g00[4] = float32_to_float16(k42[0]); - g00[5] = float32_to_float16(k52[0]); - g00[6] = float32_to_float16(k62[0]); - g00[7] = float32_to_float16(k72[0]); - - g00 += 8; - g00[0] = float32_to_float16(k03[0]); - g00[1] = float32_to_float16(k13[0]); - g00[2] = float32_to_float16(k23[0]); - g00[3] = float32_to_float16(k33[0]); - g00[4] = float32_to_float16(k43[0]); - g00[5] = float32_to_float16(k53[0]); - g00[6] = float32_to_float16(k63[0]); - g00[7] = float32_to_float16(k73[0]); - - g00 += 8; - g00[0] = float32_to_float16(k04[0]); - g00[1] = float32_to_float16(k14[0]); - g00[2] = float32_to_float16(k24[0]); - g00[3] = float32_to_float16(k34[0]); - g00[4] = float32_to_float16(k44[0]); - g00[5] = float32_to_float16(k54[0]); - g00[6] = float32_to_float16(k64[0]); - g00[7] = float32_to_float16(k74[0]); - - g00 += 8; - g00[0] = float32_to_float16(k05[0]); - g00[1] = float32_to_float16(k15[0]); - g00[2] = float32_to_float16(k25[0]); - g00[3] = float32_to_float16(k35[0]); - g00[4] = float32_to_float16(k45[0]); - g00[5] = float32_to_float16(k55[0]); - g00[6] = float32_to_float16(k65[0]); - g00[7] = float32_to_float16(k75[0]); - - g00 += 8; - g00[0] = float32_to_float16(k06[0]); - g00[1] = float32_to_float16(k16[0]); - g00[2] = float32_to_float16(k26[0]); - g00[3] = float32_to_float16(k36[0]); - g00[4] = float32_to_float16(k46[0]); - g00[5] = float32_to_float16(k56[0]); - g00[6] = float32_to_float16(k66[0]); - g00[7] = float32_to_float16(k76[0]); - - g00 += 8; - g00[0] = float32_to_float16(k07[0]); - g00[1] = float32_to_float16(k17[0]); - g00[2] = float32_to_float16(k27[0]); - g00[3] = float32_to_float16(k37[0]); - g00[4] = float32_to_float16(k47[0]); - g00[5] = float32_to_float16(k57[0]); - g00[6] = float32_to_float16(k67[0]); - g00[7] = float32_to_float16(k77[0]); - - g00 += 8; + unsigned short* g00 = (unsigned short*)g0.row(p / 8); + g00[0] = float32_to_float16(k00[0]); + g00[1] = float32_to_float16(k10[0]); + g00[2] = float32_to_float16(k20[0]); + g00[3] = float32_to_float16(k30[0]); + g00[4] = float32_to_float16(k40[0]); + g00[5] = float32_to_float16(k50[0]); + g00[6] = float32_to_float16(k60[0]); + g00[7] = float32_to_float16(k70[0]); + g00 += 8; + g00[0] = float32_to_float16(k01[0]); + g00[1] = float32_to_float16(k11[0]); + g00[2] = float32_to_float16(k21[0]); + g00[3] = float32_to_float16(k31[0]); + g00[4] = float32_to_float16(k41[0]); + g00[5] = float32_to_float16(k51[0]); + g00[6] = float32_to_float16(k61[0]); + g00[7] = float32_to_float16(k71[0]); + + g00 += 8; + g00[0] = float32_to_float16(k02[0]); + g00[1] = float32_to_float16(k12[0]); + g00[2] = float32_to_float16(k22[0]); + g00[3] = float32_to_float16(k32[0]); + g00[4] = float32_to_float16(k42[0]); + g00[5] = float32_to_float16(k52[0]); + g00[6] = float32_to_float16(k62[0]); + g00[7] = float32_to_float16(k72[0]); + + g00 += 8; + g00[0] = float32_to_float16(k03[0]); + g00[1] = float32_to_float16(k13[0]); + g00[2] = float32_to_float16(k23[0]); + g00[3] = float32_to_float16(k33[0]); + g00[4] = float32_to_float16(k43[0]); + g00[5] = float32_to_float16(k53[0]); + g00[6] = float32_to_float16(k63[0]); + g00[7] = float32_to_float16(k73[0]); + + g00 += 8; + g00[0] = float32_to_float16(k04[0]); + g00[1] = float32_to_float16(k14[0]); + g00[2] = float32_to_float16(k24[0]); + g00[3] = float32_to_float16(k34[0]); + g00[4] = float32_to_float16(k44[0]); + g00[5] = float32_to_float16(k54[0]); + g00[6] = float32_to_float16(k64[0]); + g00[7] = float32_to_float16(k74[0]); + + g00 += 8; + g00[0] = float32_to_float16(k05[0]); + g00[1] = float32_to_float16(k15[0]); + g00[2] = float32_to_float16(k25[0]); + g00[3] = float32_to_float16(k35[0]); + g00[4] = float32_to_float16(k45[0]); + g00[5] = float32_to_float16(k55[0]); + g00[6] = float32_to_float16(k65[0]); + g00[7] = float32_to_float16(k75[0]); + + g00 += 8; + g00[0] = float32_to_float16(k06[0]); + g00[1] = float32_to_float16(k16[0]); + g00[2] = float32_to_float16(k26[0]); + g00[3] = float32_to_float16(k36[0]); + g00[4] = float32_to_float16(k46[0]); + g00[5] = float32_to_float16(k56[0]); + g00[6] = float32_to_float16(k66[0]); + g00[7] = float32_to_float16(k76[0]); + + g00 += 8; + g00[0] = float32_to_float16(k07[0]); + g00[1] = float32_to_float16(k17[0]); + g00[2] = float32_to_float16(k27[0]); + g00[3] = float32_to_float16(k37[0]); + g00[4] = float32_to_float16(k47[0]); + g00[5] = float32_to_float16(k57[0]); + g00[6] = float32_to_float16(k67[0]); + g00[7] = float32_to_float16(k77[0]); + + g00 += 8; } } } - - static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { int w = bottom_blob.w; @@ -212,7 +210,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob { int nn_size = size / 12; int remain_size_start = nn_size * 12; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = ii * 12; @@ -253,7 +251,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } } nn_size = (size - remain_size_start) >> 3; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 8; @@ -290,7 +288,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob remain_size_start += nn_size << 3; nn_size = (size - remain_size_start) >> 2; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 4; @@ -317,7 +315,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob remain_size_start += nn_size << 2; nn_size = (size - remain_size_start) >> 1; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 2; @@ -339,7 +337,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } remain_size_start += nn_size << 1; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int i = remain_size_start; i < size; i++) { const float* img0 = bottom_blob.channel(0); @@ -355,7 +353,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out = top_blob.channel(p); @@ -1016,7 +1014,7 @@ static void conv1x1s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons Mat bottom_blob_shrinked; bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < channels; p++) { const float* r0 = bottom_blob.channel(p); diff --git a/src/layer/x86/convolution_2x2_pack8.h b/src/layer/x86/convolution_2x2_pack8.h index 79e6716acf46..0e59af34ec3e 100644 --- a/src/layer/x86/convolution_2x2_pack8.h +++ b/src/layer/x86/convolution_2x2_pack8.h @@ -20,7 +20,7 @@ static void conv2x2s1_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat int outch = top_blob.c; const float* bias = _bias; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out0 = top_blob.channel(p); diff --git a/src/layer/x86/convolution_2x2_pack8_fp16.h b/src/layer/x86/convolution_2x2_pack8_fp16.h index 0d87ef826363..168bc77da425 100644 --- a/src/layer/x86/convolution_2x2_pack8_fp16.h +++ b/src/layer/x86/convolution_2x2_pack8_fp16.h @@ -106,7 +106,7 @@ static void conv2x2s1_weight_fp16_pack8_avx(const Mat& kernel, Mat& kernel_tm_pa const float* k76 = k7.row(p + 6); const float* k77 = k7.row(p + 7); - unsigned short* g00 =(unsigned short*) g0.row(p / 8); + unsigned short* g00 = (unsigned short*)g0.row(p / 8); for (int k = 0; k < 4; k++) { @@ -201,7 +201,7 @@ static void conv2x2s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons int outch = top_blob.c; const float* bias = _bias; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out0 = top_blob.channel(p); diff --git a/src/layer/x86/convolution_3x3_pack8.h b/src/layer/x86/convolution_3x3_pack8.h index 2c9ab4ee3af9..a4bc42bda87e 100644 --- a/src/layer/x86/convolution_3x3_pack8.h +++ b/src/layer/x86/convolution_3x3_pack8.h @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. - static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch) { // winograd63 transform kernel @@ -27,10 +26,9 @@ static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, M {1.0f / 90, -1.0f / 45, 2.0f / 45}, {1.0f / 45, 1.0f / 90, 1.0f / 180}, {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + {0.0f, 0.0f, 1.0f}}; - #pragma omp parallel for +#pragma omp parallel for for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) @@ -300,7 +298,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < inch; q++) { const Mat img0 = bottom_blob_bordered.channel(q); @@ -487,7 +485,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob else // if (tiles >= 1) bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int r = 0; r < 64; r++) { Mat tm2 = bottom_blob_tm2.channel(r); @@ -626,7 +624,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { float* output0_tm = top_blob_tm.channel(p); @@ -1334,7 +1332,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob int h_tm = outh / 6 * 8; const int tiles = w_tm / 8 * h_tm / 8; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { const Mat out0_tm = top_blob_tm.channel(p); diff --git a/src/layer/x86/convolution_3x3_pack8_fp16.h b/src/layer/x86/convolution_3x3_pack8_fp16.h index a77d91b65f06..0bd0f5047588 100644 --- a/src/layer/x86/convolution_3x3_pack8_fp16.h +++ b/src/layer/x86/convolution_3x3_pack8_fp16.h @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. - static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch) { // winograd63 transform kernel @@ -27,10 +26,9 @@ static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kern {1.0f / 90, -1.0f / 45, 2.0f / 45}, {1.0f / 45, 1.0f / 90, 1.0f / 180}, {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + {0.0f, 0.0f, 1.0f}}; - #pragma omp parallel for +#pragma omp parallel for for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) @@ -300,7 +298,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < inch; q++) { const Mat img0 = bottom_blob_bordered.channel(q); @@ -465,7 +463,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top } bottom_blob_bordered = Mat(); // END transform input - + // BEGIN dot Mat top_blob_tm; { @@ -487,7 +485,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top else // if (tiles >= 1) bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int r = 0; r < 64; r++) { Mat tm2 = bottom_blob_tm2.channel(r); @@ -625,7 +623,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top // permute end top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { float* output0_tm = top_blob_tm.channel(p); @@ -1333,7 +1331,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top int h_tm = outh / 6 * 8; const int tiles = w_tm / 8 * h_tm / 8; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { const Mat out0_tm = top_blob_tm.channel(p); diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 3c5af335a9e6..95ff6f2c5071 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -191,24 +191,24 @@ int Convolution_x86::create_pipeline(const Option& opt) // pack8 if (elempack == 8 && out_elempack == 8) { - if (opt.use_fp16_storage &&kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + if (opt.use_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(weight_data, weight_data_pack8, num_input, num_output); } else if (opt.use_fp16_storage && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(weight_data, weight_data_pack8, num_input, num_output); - } else if (opt.use_fp16_storage && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(weight_data, weight_data_pack8, num_input, num_output); - - } else if (opt.use_fp16_storage && kernel_w == 2 && kernel_h == 2 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + } + else if (opt.use_fp16_storage && kernel_w == 2 && kernel_h == 2 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { conv2x2s1_weight_fp16_pack8_avx(weight_data, weight_data_pack8, num_input, num_output); - - } else { + } + else + { if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { conv3x3s1_winograd64_transform_kernel_pack8_avx(weight_data, weight_data_pack8, num_input, num_output); @@ -604,12 +604,14 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option #if __AVX__ if (elempack == 8 && out_elempack == 8) { - if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - if (opt.use_fp16_storage) { + if (opt.use_fp16_storage) + { conv1x1s1_sgemm_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); - } else { + } + else + { conv1x1s1_sgemm_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); } @@ -620,12 +622,13 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - if (opt.use_fp16_storage) { + if (opt.use_fp16_storage) + { conv1x1s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); - - } else { + } + else + { conv1x1s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); - } if (activation) { @@ -634,12 +637,13 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - if (opt.use_fp16_storage) { + if (opt.use_fp16_storage) + { conv1x1s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); - - } else { + } + else + { conv1x1s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); - } if (activation) @@ -649,9 +653,12 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - if (opt.use_fp16_storage) { + if (opt.use_fp16_storage) + { conv3x3s1_winograd64_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); - } else { + } + else + { conv3x3s1_winograd64_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); } @@ -662,9 +669,12 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else if (kernel_w == 2 && kernel_h == 2 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - if (opt.use_fp16_storage) { + if (opt.use_fp16_storage) + { conv2x2s1_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); - } else { + } + else + { conv2x2s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); } @@ -675,8 +685,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) +// num_output +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { float* outptr = top_blob.channel(p); @@ -764,8 +774,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) +// num_output +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { float* outptr = top_blob.channel(p); @@ -822,8 +832,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) +// num_output +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { float* outptr = top_blob.channel(p); @@ -909,8 +919,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) +// num_output +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { float* outptr = top_blob.channel(p); @@ -991,7 +1001,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt) use_winograd3x3_int8 = false; if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 - && num_input >= 16 && num_output >= 16) + && num_input >= 16 && num_output >= 16) { // winograd is slow on small channel count use_winograd3x3_int8 = true; @@ -1059,10 +1069,10 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con if (use_winograd3x3_int8) { conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt); - // conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt); +// conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt); - // requantize, reverse scale inplace - #pragma omp parallel for num_threads(opt.num_threads) +// requantize, reverse scale inplace +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { Option opt_g = opt; @@ -1109,10 +1119,10 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con if (use_winograd3x3_int8) { conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt); - // conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt); +// conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt); - // dequantize, reverse scale inplace - #pragma omp parallel for num_threads(opt.num_threads) +// dequantize, reverse scale inplace +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { Option opt_g = opt; @@ -1196,7 +1206,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, if (inner_top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < bottom_blob.c; c++) { float* outptr = inner_bottom_blob.channel(c); @@ -1216,7 +1226,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, opt_g.blob_allocator = inner_top_blob.allocator; convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < num_output; c++) { float* outptr = (float*)top_blob.channel(c) + x * outw + y; diff --git a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h index f4eeabb923ba..b23ea4b41ffe 100644 --- a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h +++ b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h @@ -23,7 +23,7 @@ static void convdw3x3s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co const float* bias = _bias; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Mat out = top_blob.channel(g); @@ -234,14 +234,14 @@ static void convdw3x3s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co const float* bias = _bias; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Mat out = top_blob.channel(g); __m256 _bias0 = bias ? _mm256_loadu_ps((const float*)bias + g * 8) : _mm256_set1_ps(0.f); - const unsigned short* k0 =(const unsigned short* )kernel.row(g); + const unsigned short* k0 = (const unsigned short*)kernel.row(g); float* outptr0 = out.row(0); float* outptr1 = out.row(1); diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp index d277aa5d1964..c9b624161895 100644 --- a/src/layer/x86/convolutiondepthwise_x86.cpp +++ b/src/layer/x86/convolutiondepthwise_x86.cpp @@ -118,7 +118,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) // pack8 if (elempack == 8) { - if (opt.use_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { Mat weight_data_r2 = weight_data.reshape(maxk, group); @@ -127,7 +126,7 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) ncnn::cast_float32_to_float16(weight_data_tmp, weight_data_pack8, opt); return 0; } - if (opt.use_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + if (opt.use_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { Mat weight_data_r2 = weight_data.reshape(maxk, group); Mat weight_data_tmp; @@ -289,9 +288,12 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con { if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - if (opt.use_fp16_storage) { + if (opt.use_fp16_storage) + { convdw3x3s1_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); - } else { + } + else + { convdw3x3s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); } @@ -304,9 +306,12 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con } if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { - if (opt.use_fp16_storage) { + if (opt.use_fp16_storage) + { convdw3x3s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); - } else { + } + else + { convdw3x3s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt); } @@ -362,7 +367,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { float* outptr = top_blob.channel(g); @@ -499,8 +504,8 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_ const int channels_g = channels / group; - // quantize, scale and round to nearest - #pragma omp parallel for num_threads(opt.num_threads) +// quantize, scale and round to nearest +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Option opt_g = opt; @@ -614,7 +619,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_ const int channels_g = channels / group; const int num_output_g = num_output / group; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g); diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp old mode 100755 new mode 100644 index 416365eb1046..0135fb26f157 --- a/src/layer/x86/innerproduct_x86.cpp +++ b/src/layer/x86/innerproduct_x86.cpp @@ -1,425 +1,438 @@ -// Tencent is pleased to support the open source community by making ncnn -// available. -// -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this -// file except in compliance with the License. You may obtain a copy of the -// License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -// License for the specific language governing permissions and limitations under -// the License. -#include - -#ifdef __AVX__ -#include "avx_activation.h" -#include "avx_usability.h" -#endif // NCNN_AVX2 - -#include "innerproduct_x86.h" - -#include "layer_type.h" - - -namespace ncnn { - -DEFINE_LAYER_CREATOR(InnerProduct_x86) - - -InnerProduct_x86::InnerProduct_x86() -{ -#if __AVX__ - support_packing = true; -#endif // __AVX__ - - flatten = 0; -} - -int InnerProduct_x86::create_pipeline(const Option& opt) -{ -#if __AVX__ - if (opt.use_packing_layout) - { - flatten = ncnn::create_layer(ncnn::LayerType::Flatten); - - ncnn::ParamDict pd; - - flatten->load_param(pd); - - flatten->create_pipeline(opt); - } - if (opt.use_fp16_storage && weight_data.elemsize == 4u) - { - ncnn::cast_float32_to_float16(weight_data, weight_data_fp16, opt); - } -#endif // __AVX__ - - - return 0; -} - -int InnerProduct_x86::destroy_pipeline(const Option& opt) -{ - if (flatten) - { - flatten->destroy_pipeline(opt); - delete flatten; - flatten = 0; - } - - return 0; -} - -int InnerProduct_x86::forward(const Mat &bottom_blob, Mat &top_blob, - const Option &opt) const { - if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) - { - // TODO - return InnerProduct::forward(bottom_blob, top_blob, opt); - } - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - int size = w * h; - // fprintf(stderr, "bottom_blob %d x %d x %d, elempack = %d \n", w,h,channels,elempack); -#if __AVX__ - if (elempack == 8) - { - // flatten - Mat bottom_blob_flattened = bottom_blob; - if (bottom_blob.dims != 1) - { - Option opt_flatten = opt; - opt_flatten.blob_allocator = opt.workspace_allocator; - - flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); - } - - // pack1 - { - bottom_blob_flattened.w *= bottom_blob_flattened.elempack; - bottom_blob_flattened.cstep = bottom_blob_flattened.w; - bottom_blob_flattened.elemsize = 4u; - bottom_blob_flattened.elempack = 1; - } - if ( opt.use_fp16_storage) { - return forward_fp16(bottom_blob_flattened, top_blob, opt); - } else { - return forward(bottom_blob_flattened, top_blob, opt); - } - } - - if (size % 8 == 0 && opt.use_fp16_storage) { - return forward_fp16(bottom_blob, top_blob, opt); - } - top_blob.create(num_output, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - - const float *weight_data_ptr = weight_data; - - int nn_num_output = num_output >> 3; - int remain_num_output_start = nn_num_output << 3; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) { - int p = pp * 8; - - float sums[8] = {0.0f}; - if (bias_term) { - sums[0] = bias_data[p]; - sums[1] = bias_data[p + 1]; - sums[2] = bias_data[p + 2]; - sums[3] = bias_data[p + 3]; - sums[4] = bias_data[p + 4]; - sums[5] = bias_data[p + 5]; - sums[6] = bias_data[p + 6]; - sums[7] = bias_data[p + 7]; - } - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - __m256 _sum4 = _mm256_set1_ps(0.f); - __m256 _sum5 = _mm256_set1_ps(0.f); - __m256 _sum6 = _mm256_set1_ps(0.f); - __m256 _sum7 = _mm256_set1_ps(0.f); - - const float *w0 = weight_data_ptr + size * channels * p; - const float *w1 = weight_data_ptr + size * channels * (p + 1); - const float *w2 = weight_data_ptr + size * channels * (p + 2); - const float *w3 = weight_data_ptr + size * channels * (p + 3); - const float *w4 = weight_data_ptr + size * channels * (p + 4); - const float *w5 = weight_data_ptr + size * channels * (p + 5); - const float *w6 = weight_data_ptr + size * channels * (p + 6); - const float *w7 = weight_data_ptr + size * channels * (p + 7); - - - // channels - for (int q = 0; q < channels; q++) { - const float *m = bottom_blob.channel(q); - int nn = size >> 3; - int remain = size & 7; - - for (; nn > 0; nn--) { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w0 = _mm256_loadu_ps(w0); - _sum0 = _mm256_fmadd_ps(_m, _w0, _sum0); - - __m256 _w1 = _mm256_loadu_ps(w1); - _sum1 = _mm256_fmadd_ps(_m, _w1, _sum1); - - __m256 _w2 = _mm256_loadu_ps(w2); - _sum2 = _mm256_fmadd_ps(_m, _w2, _sum2); - - __m256 _w3 = _mm256_loadu_ps(w3); - _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); - - __m256 _w4 = _mm256_loadu_ps(w4); - _sum4 = _mm256_fmadd_ps(_m, _w4, _sum4); - - __m256 _w5 = _mm256_loadu_ps(w5); - _sum5 = _mm256_fmadd_ps(_m, _w5, _sum5); - - __m256 _w6 = _mm256_loadu_ps(w6); - _sum6 = _mm256_fmadd_ps(_m, _w6, _sum6); - - __m256 _w7 = _mm256_loadu_ps(w7); - _sum7 = _mm256_fmadd_ps(_m, _w7, _sum7); - - m += 8; - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - w4 += 8; - w5 += 8; - w6 += 8; - w7 += 8; - } - - - for (; remain > 0; remain--) { - sums[0] += *m * *w0; - sums[1] += *m * *w1; - sums[2] += *m * *w2; - sums[3] += *m * *w3; - sums[4] += *m * *w4; - sums[5] += *m * *w5; - sums[6] += *m * *w6; - sums[7] += *m * *w7; - - m++; - w0++; - w1++; - w2++; - w3++; - w4++; - w5++; - w6++; - w7++; - } - __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, - _sum6, _sum7); - __m256 _sums_f = _mm256_loadu_ps(&sums[0]); - _sums = activation_ps(_mm256_add_ps(_sums_f, _sums), activation_type, - activation_params); - _mm256_storeu_ps(&top_blob[p], _sums); - } - } - -// num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = remain_num_output_start; p < num_output; p++) { - float sum = 0.f; - - if (bias_term) - sum = bias_data[p]; - - const float *w = weight_data_ptr + size * channels * p; - - __m256 _sum = _mm256_set1_ps(0.f); - // channels - for (int q = 0; q < channels; q++) { - const float *m = bottom_blob.channel(q); - - int nn = size >> 3; - int remain = size & 7; - for (; nn > 0; nn--) { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w = _mm256_loadu_ps(w); - _sum = _mm256_fmadd_ps(_m, _w, _sum); - - m += 8; - w += 8; - } - for (; remain > 0; remain--) { - sum += *m * *w; - m++; - w++; - } - } - - sum += _mm256_reduce_add_ps(_sum); - sum = activation_ss(sum, activation_type, activation_params); - - top_blob[p] = sum; - } - return 0; -#else - return InnerProduct::forward(bottom_blob, top_blob, opt); -#endif // __AVX__ -} -#if __AVX__ - -int InnerProduct_x86::forward_fp16(const Mat &bottom_blob, Mat &top_blob, - const Option &opt) const { - - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - int size = w * h; - - top_blob.create(num_output, elemsize, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const unsigned short *weight_data_ptr = (const unsigned short *)weight_data_fp16; - - int nn_num_output = num_output >> 3; - int remain_num_output_start = nn_num_output << 3; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) { - int p = pp*8; - float sums[8] = {0.0f}; - if (bias_term) { - sums[0] = bias_data[p]; - sums[1] = bias_data[p + 1]; - sums[2] = bias_data[p + 2]; - sums[3] = bias_data[p + 3]; - sums[4] = bias_data[p + 4]; - sums[5] = bias_data[p + 5]; - sums[6] = bias_data[p + 6]; - sums[7] = bias_data[p + 7]; - } - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - __m256 _sum4 = _mm256_set1_ps(0.f); - __m256 _sum5 = _mm256_set1_ps(0.f); - __m256 _sum6 = _mm256_set1_ps(0.f); - __m256 _sum7 = _mm256_set1_ps(0.f); - - const unsigned short *w0 = weight_data_ptr + size * channels * p; - const unsigned short *w1 = weight_data_ptr + size * channels * (p + 1); - const unsigned short *w2 = weight_data_ptr + size * channels * (p + 2); - const unsigned short *w3 = weight_data_ptr + size * channels * (p + 3); - const unsigned short *w4 = weight_data_ptr + size * channels * (p + 4); - const unsigned short *w5 = weight_data_ptr + size * channels * (p + 5); - const unsigned short *w6 = weight_data_ptr + size * channels * (p + 6); - const unsigned short *w7 = weight_data_ptr + size * channels * (p + 7); - - - // channels - for (int q = 0; q < channels; q++) { - const float *m = bottom_blob.channel(q); - int nn = size >> 3; - for (; nn > 0; nn--) { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w0 = loadfp16(w0); - _sum0 = _mm256_fmadd_ps(_m, _w0, _sum0); - - __m256 _w1 = loadfp16(w1); - _sum1 = _mm256_fmadd_ps(_m, _w1, _sum1); - - __m256 _w2 = loadfp16(w2); - _sum2 = _mm256_fmadd_ps(_m, _w2, _sum2); - - __m256 _w3 = loadfp16(w3); - _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); - - __m256 _w4 = loadfp16(w4); - _sum4 = _mm256_fmadd_ps(_m, _w4, _sum4); - - __m256 _w5 = loadfp16(w5); - _sum5 = _mm256_fmadd_ps(_m, _w5, _sum5); - - __m256 _w6 = loadfp16(w6); - _sum6 = _mm256_fmadd_ps(_m, _w6, _sum6); - - __m256 _w7 = loadfp16(w7); - _sum7 = _mm256_fmadd_ps(_m, _w7, _sum7); - - m += 8; - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - w4 += 8; - w5 += 8; - w6 += 8; - w7 += 8; - } - __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, - _sum6, _sum7); - __m256 _sums_f = _mm256_loadu_ps(&sums[0]); - _sums = activation_ps(_mm256_add_ps(_sums_f, _sums), activation_type, - activation_params); - _mm256_storeu_ps(&top_blob[p], _sums); - } - } - -// num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = remain_num_output_start; p < num_output; p++) { - float sum = 0.f; - - if (bias_term) - sum = bias_data[p]; - - const unsigned short *w = weight_data_ptr + size * channels * p; - - __m256 _sum = _mm256_set1_ps(0.f); - // channels - for (int q = 0; q < channels; q++) { - const float *m = bottom_blob.channel(q); - - int nn = size >> 3; - for (; nn > 0; nn--) { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w = loadfp16(w); - _sum = _mm256_fmadd_ps(_m, _w, _sum); - - m += 8; - w += 8; - } - } - - sum += _mm256_reduce_add_ps(_sum); - sum = activation_ss(sum, activation_type, activation_params); - - top_blob[p] = sum; - } - return 0; -} -#endif // __ARM_NEON - -} // namespace ncnn +// Tencent is pleased to support the open source community by making ncnn +// available. +// +// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this +// file except in compliance with the License. You may obtain a copy of the +// License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. +#include + +#ifdef __AVX__ +#include "avx_activation.h" +#include "avx_usability.h" +#endif // NCNN_AVX2 + +#include "innerproduct_x86.h" + +#include "layer_type.h" + +namespace ncnn { + +DEFINE_LAYER_CREATOR(InnerProduct_x86) + +InnerProduct_x86::InnerProduct_x86() +{ +#if __AVX__ + support_packing = true; +#endif // __AVX__ + + flatten = 0; +} + +int InnerProduct_x86::create_pipeline(const Option& opt) +{ +#if __AVX__ + if (opt.use_packing_layout) + { + flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + + ncnn::ParamDict pd; + + flatten->load_param(pd); + + flatten->create_pipeline(opt); + } + if (opt.use_fp16_storage && weight_data.elemsize == 4u) + { + ncnn::cast_float32_to_float16(weight_data, weight_data_fp16, opt); + } +#endif // __AVX__ + + return 0; +} + +int InnerProduct_x86::destroy_pipeline(const Option& opt) +{ + if (flatten) + { + flatten->destroy_pipeline(opt); + delete flatten; + flatten = 0; + } + + return 0; +} + +int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, + const Option& opt) const +{ + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + // TODO + return InnerProduct::forward(bottom_blob, top_blob, opt); + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int size = w * h; + // fprintf(stderr, "bottom_blob %d x %d x %d, elempack = %d \n", w,h,channels,elempack); +#if __AVX__ + if (elempack == 8) + { + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + // pack1 + { + bottom_blob_flattened.w *= bottom_blob_flattened.elempack; + bottom_blob_flattened.cstep = bottom_blob_flattened.w; + bottom_blob_flattened.elemsize = 4u; + bottom_blob_flattened.elempack = 1; + } + if (opt.use_fp16_storage) + { + return forward_fp16(bottom_blob_flattened, top_blob, opt); + } + else + { + return forward(bottom_blob_flattened, top_blob, opt); + } + } + + if (size % 8 == 0 && opt.use_fp16_storage) + { + return forward_fp16(bottom_blob, top_blob, opt); + } + top_blob.create(num_output, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* weight_data_ptr = weight_data; + + int nn_num_output = num_output >> 3; + int remain_num_output_start = nn_num_output << 3; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 8; + + float sums[8] = {0.0f}; + if (bias_term) + { + sums[0] = bias_data[p]; + sums[1] = bias_data[p + 1]; + sums[2] = bias_data[p + 2]; + sums[3] = bias_data[p + 3]; + sums[4] = bias_data[p + 4]; + sums[5] = bias_data[p + 5]; + sums[6] = bias_data[p + 6]; + sums[7] = bias_data[p + 7]; + } + __m256 _sum0 = _mm256_set1_ps(0.f); + __m256 _sum1 = _mm256_set1_ps(0.f); + __m256 _sum2 = _mm256_set1_ps(0.f); + __m256 _sum3 = _mm256_set1_ps(0.f); + __m256 _sum4 = _mm256_set1_ps(0.f); + __m256 _sum5 = _mm256_set1_ps(0.f); + __m256 _sum6 = _mm256_set1_ps(0.f); + __m256 _sum7 = _mm256_set1_ps(0.f); + + const float* w0 = weight_data_ptr + size * channels * p; + const float* w1 = weight_data_ptr + size * channels * (p + 1); + const float* w2 = weight_data_ptr + size * channels * (p + 2); + const float* w3 = weight_data_ptr + size * channels * (p + 3); + const float* w4 = weight_data_ptr + size * channels * (p + 4); + const float* w5 = weight_data_ptr + size * channels * (p + 5); + const float* w6 = weight_data_ptr + size * channels * (p + 6); + const float* w7 = weight_data_ptr + size * channels * (p + 7); + + // channels + for (int q = 0; q < channels; q++) + { + const float* m = bottom_blob.channel(q); + int nn = size >> 3; + int remain = size & 7; + + for (; nn > 0; nn--) + { + __m256 _m = _mm256_loadu_ps(m); + + __m256 _w0 = _mm256_loadu_ps(w0); + _sum0 = _mm256_fmadd_ps(_m, _w0, _sum0); + + __m256 _w1 = _mm256_loadu_ps(w1); + _sum1 = _mm256_fmadd_ps(_m, _w1, _sum1); + + __m256 _w2 = _mm256_loadu_ps(w2); + _sum2 = _mm256_fmadd_ps(_m, _w2, _sum2); + + __m256 _w3 = _mm256_loadu_ps(w3); + _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); + + __m256 _w4 = _mm256_loadu_ps(w4); + _sum4 = _mm256_fmadd_ps(_m, _w4, _sum4); + + __m256 _w5 = _mm256_loadu_ps(w5); + _sum5 = _mm256_fmadd_ps(_m, _w5, _sum5); + + __m256 _w6 = _mm256_loadu_ps(w6); + _sum6 = _mm256_fmadd_ps(_m, _w6, _sum6); + + __m256 _w7 = _mm256_loadu_ps(w7); + _sum7 = _mm256_fmadd_ps(_m, _w7, _sum7); + + m += 8; + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + } + + for (; remain > 0; remain--) + { + sums[0] += *m * *w0; + sums[1] += *m * *w1; + sums[2] += *m * *w2; + sums[3] += *m * *w3; + sums[4] += *m * *w4; + sums[5] += *m * *w5; + sums[6] += *m * *w6; + sums[7] += *m * *w7; + + m++; + w0++; + w1++; + w2++; + w3++; + w4++; + w5++; + w6++; + w7++; + } + __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, + _sum6, _sum7); + __m256 _sums_f = _mm256_loadu_ps(&sums[0]); + _sums = activation_ps(_mm256_add_ps(_sums_f, _sums), activation_type, + activation_params); + _mm256_storeu_ps(&top_blob[p], _sums); + } + } + + // num_output +#pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const float* w = weight_data_ptr + size * channels * p; + + __m256 _sum = _mm256_set1_ps(0.f); + // channels + for (int q = 0; q < channels; q++) + { + const float* m = bottom_blob.channel(q); + + int nn = size >> 3; + int remain = size & 7; + for (; nn > 0; nn--) + { + __m256 _m = _mm256_loadu_ps(m); + + __m256 _w = _mm256_loadu_ps(w); + _sum = _mm256_fmadd_ps(_m, _w, _sum); + + m += 8; + w += 8; + } + for (; remain > 0; remain--) + { + sum += *m * *w; + m++; + w++; + } + } + + sum += _mm256_reduce_add_ps(_sum); + sum = activation_ss(sum, activation_type, activation_params); + + top_blob[p] = sum; + } + return 0; +#else + return InnerProduct::forward(bottom_blob, top_blob, opt); +#endif // __AVX__ +} +#if __AVX__ + +int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob, + const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int size = w * h; + + top_blob.create(num_output, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const unsigned short* weight_data_ptr = (const unsigned short*)weight_data_fp16; + + int nn_num_output = num_output >> 3; + int remain_num_output_start = nn_num_output << 3; + +#pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 8; + float sums[8] = {0.0f}; + if (bias_term) + { + sums[0] = bias_data[p]; + sums[1] = bias_data[p + 1]; + sums[2] = bias_data[p + 2]; + sums[3] = bias_data[p + 3]; + sums[4] = bias_data[p + 4]; + sums[5] = bias_data[p + 5]; + sums[6] = bias_data[p + 6]; + sums[7] = bias_data[p + 7]; + } + __m256 _sum0 = _mm256_set1_ps(0.f); + __m256 _sum1 = _mm256_set1_ps(0.f); + __m256 _sum2 = _mm256_set1_ps(0.f); + __m256 _sum3 = _mm256_set1_ps(0.f); + __m256 _sum4 = _mm256_set1_ps(0.f); + __m256 _sum5 = _mm256_set1_ps(0.f); + __m256 _sum6 = _mm256_set1_ps(0.f); + __m256 _sum7 = _mm256_set1_ps(0.f); + + const unsigned short* w0 = weight_data_ptr + size * channels * p; + const unsigned short* w1 = weight_data_ptr + size * channels * (p + 1); + const unsigned short* w2 = weight_data_ptr + size * channels * (p + 2); + const unsigned short* w3 = weight_data_ptr + size * channels * (p + 3); + const unsigned short* w4 = weight_data_ptr + size * channels * (p + 4); + const unsigned short* w5 = weight_data_ptr + size * channels * (p + 5); + const unsigned short* w6 = weight_data_ptr + size * channels * (p + 6); + const unsigned short* w7 = weight_data_ptr + size * channels * (p + 7); + + // channels + for (int q = 0; q < channels; q++) + { + const float* m = bottom_blob.channel(q); + int nn = size >> 3; + for (; nn > 0; nn--) + { + __m256 _m = _mm256_loadu_ps(m); + + __m256 _w0 = loadfp16(w0); + _sum0 = _mm256_fmadd_ps(_m, _w0, _sum0); + + __m256 _w1 = loadfp16(w1); + _sum1 = _mm256_fmadd_ps(_m, _w1, _sum1); + + __m256 _w2 = loadfp16(w2); + _sum2 = _mm256_fmadd_ps(_m, _w2, _sum2); + + __m256 _w3 = loadfp16(w3); + _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); + + __m256 _w4 = loadfp16(w4); + _sum4 = _mm256_fmadd_ps(_m, _w4, _sum4); + + __m256 _w5 = loadfp16(w5); + _sum5 = _mm256_fmadd_ps(_m, _w5, _sum5); + + __m256 _w6 = loadfp16(w6); + _sum6 = _mm256_fmadd_ps(_m, _w6, _sum6); + + __m256 _w7 = loadfp16(w7); + _sum7 = _mm256_fmadd_ps(_m, _w7, _sum7); + + m += 8; + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + } + __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, + _sum6, _sum7); + __m256 _sums_f = _mm256_loadu_ps(&sums[0]); + _sums = activation_ps(_mm256_add_ps(_sums_f, _sums), activation_type, + activation_params); + _mm256_storeu_ps(&top_blob[p], _sums); + } + } + + // num_output +#pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const unsigned short* w = weight_data_ptr + size * channels * p; + + __m256 _sum = _mm256_set1_ps(0.f); + // channels + for (int q = 0; q < channels; q++) + { + const float* m = bottom_blob.channel(q); + + int nn = size >> 3; + for (; nn > 0; nn--) + { + __m256 _m = _mm256_loadu_ps(m); + + __m256 _w = loadfp16(w); + _sum = _mm256_fmadd_ps(_m, _w, _sum); + + m += 8; + w += 8; + } + } + + sum += _mm256_reduce_add_ps(_sum); + sum = activation_ss(sum, activation_type, activation_params); + + top_blob[p] = sum; + } + return 0; +} +#endif // __ARM_NEON + +} // namespace ncnn diff --git a/src/layer/x86/innerproduct_x86.h b/src/layer/x86/innerproduct_x86.h index dbaf299fb729..a08dd1f820b1 100644 --- a/src/layer/x86/innerproduct_x86.h +++ b/src/layer/x86/innerproduct_x86.h @@ -22,20 +22,22 @@ namespace ncnn { -class InnerProduct_x86 : virtual public InnerProduct { +class InnerProduct_x86 : virtual public InnerProduct +{ public: InnerProduct_x86(); - virtual int create_pipeline(const Option &opt); - virtual int destroy_pipeline(const Option &opt); + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, + const Option& opt) const; - virtual int forward(const Mat &bottom_blob, Mat &top_blob, - const Option &opt) const; protected: int forward_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; public: - ncnn::Layer *flatten; + ncnn::Layer* flatten; // fp16 weight data Mat weight_data_fp16; diff --git a/tests/testutil.h b/tests/testutil.h index 14c2083ae609..3703ecc3611f 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -770,7 +770,6 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec opts[2].use_bf16_storage = true; opts[2].use_shader_pack8 = true; opts[2].use_image_storage = true; - for (int i = 0; i < 3; i++) { From 6223e511c89625cfc940a959590a04e4efe8a5a1 Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Wed, 24 Jun 2020 09:11:03 +0000 Subject: [PATCH 2/4] Restyled by astyle --- src/layer/x86/cast_x86.cpp | 8 ++++---- src/layer/x86/convolution_1x1_pack8.h | 14 +++++++------- src/layer/x86/convolution_1x1_pack8_fp16.h | 14 +++++++------- src/layer/x86/convolution_2x2_pack8.h | 2 +- src/layer/x86/convolution_2x2_pack8_fp16.h | 2 +- src/layer/x86/convolution_3x3_pack8.h | 13 +++++++------ src/layer/x86/convolution_3x3_pack8_fp16.h | 13 +++++++------ src/layer/x86/convolution_x86.cpp | 18 +++++++++--------- .../x86/convolutiondepthwise_3x3_pack8_fp16.h | 4 ++-- src/layer/x86/convolutiondepthwise_x86.cpp | 6 +++--- src/layer/x86/innerproduct_x86.cpp | 8 ++++---- 11 files changed, 52 insertions(+), 50 deletions(-) diff --git a/src/layer/x86/cast_x86.cpp b/src/layer/x86/cast_x86.cpp index 6c3ba6ad6103..efb9df78758f 100644 --- a/src/layer/x86/cast_x86.cpp +++ b/src/layer/x86/cast_x86.cpp @@ -134,7 +134,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) for (int i = 0; i < remain; i++) mask.m256i_u32[i] = 0x80000000; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = bottom_blob.channel(q); @@ -166,7 +166,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) for (int i = 0; i < remain; i++) mask.m256i_u32[i] = 0x80000000; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const unsigned short* ptr = bottom_blob.channel(q); @@ -192,7 +192,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (type_from == 4 && type_to == 1) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const unsigned short* ptr = bottom_blob.channel(q); @@ -217,7 +217,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (type_from == 1 && type_to == 4) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = bottom_blob.channel(q); diff --git a/src/layer/x86/convolution_1x1_pack8.h b/src/layer/x86/convolution_1x1_pack8.h index e75a3c107e3d..d832d4b78d2e 100644 --- a/src/layer/x86/convolution_1x1_pack8.h +++ b/src/layer/x86/convolution_1x1_pack8.h @@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con { int nn_size = size / 12; int remain_size_start = nn_size * 12; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = ii * 12; @@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } } nn_size = (size - remain_size_start) >> 3; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 8; @@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con remain_size_start += nn_size << 3; nn_size = (size - remain_size_start) >> 2; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 4; @@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con remain_size_start += nn_size << 2; nn_size = (size - remain_size_start) >> 1; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 2; @@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } remain_size_start += nn_size << 1; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int i = remain_size_start; i < size; i++) { const float* img0 = bottom_blob.channel(0); @@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out = top_blob.channel(p); @@ -1014,7 +1014,7 @@ static void conv1x1s2_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat Mat bottom_blob_shrinked; bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < channels; p++) { const float* r0 = bottom_blob.channel(p); diff --git a/src/layer/x86/convolution_1x1_pack8_fp16.h b/src/layer/x86/convolution_1x1_pack8_fp16.h index 5642e4a210d2..eb86507ff2fc 100644 --- a/src/layer/x86/convolution_1x1_pack8_fp16.h +++ b/src/layer/x86/convolution_1x1_pack8_fp16.h @@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob { int nn_size = size / 12; int remain_size_start = nn_size * 12; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = ii * 12; @@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } } nn_size = (size - remain_size_start) >> 3; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 8; @@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob remain_size_start += nn_size << 3; nn_size = (size - remain_size_start) >> 2; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 4; @@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob remain_size_start += nn_size << 2; nn_size = (size - remain_size_start) >> 1; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 2; @@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } remain_size_start += nn_size << 1; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int i = remain_size_start; i < size; i++) { const float* img0 = bottom_blob.channel(0); @@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out = top_blob.channel(p); @@ -1014,7 +1014,7 @@ static void conv1x1s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons Mat bottom_blob_shrinked; bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < channels; p++) { const float* r0 = bottom_blob.channel(p); diff --git a/src/layer/x86/convolution_2x2_pack8.h b/src/layer/x86/convolution_2x2_pack8.h index 0e59af34ec3e..79e6716acf46 100644 --- a/src/layer/x86/convolution_2x2_pack8.h +++ b/src/layer/x86/convolution_2x2_pack8.h @@ -20,7 +20,7 @@ static void conv2x2s1_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat int outch = top_blob.c; const float* bias = _bias; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out0 = top_blob.channel(p); diff --git a/src/layer/x86/convolution_2x2_pack8_fp16.h b/src/layer/x86/convolution_2x2_pack8_fp16.h index 168bc77da425..68bbcfe02de7 100644 --- a/src/layer/x86/convolution_2x2_pack8_fp16.h +++ b/src/layer/x86/convolution_2x2_pack8_fp16.h @@ -201,7 +201,7 @@ static void conv2x2s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons int outch = top_blob.c; const float* bias = _bias; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out0 = top_blob.channel(p); diff --git a/src/layer/x86/convolution_3x3_pack8.h b/src/layer/x86/convolution_3x3_pack8.h index a4bc42bda87e..923779d11a0f 100644 --- a/src/layer/x86/convolution_3x3_pack8.h +++ b/src/layer/x86/convolution_3x3_pack8.h @@ -26,9 +26,10 @@ static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, M {1.0f / 90, -1.0f / 45, 2.0f / 45}, {1.0f / 45, 1.0f / 90, 1.0f / 180}, {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f}}; + {0.0f, 0.0f, 1.0f} + }; -#pragma omp parallel for + #pragma omp parallel for for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) @@ -298,7 +299,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < inch; q++) { const Mat img0 = bottom_blob_bordered.channel(q); @@ -485,7 +486,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob else // if (tiles >= 1) bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int r = 0; r < 64; r++) { Mat tm2 = bottom_blob_tm2.channel(r); @@ -624,7 +625,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { float* output0_tm = top_blob_tm.channel(p); @@ -1332,7 +1333,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob int h_tm = outh / 6 * 8; const int tiles = w_tm / 8 * h_tm / 8; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { const Mat out0_tm = top_blob_tm.channel(p); diff --git a/src/layer/x86/convolution_3x3_pack8_fp16.h b/src/layer/x86/convolution_3x3_pack8_fp16.h index 0bd0f5047588..8d372d0c136b 100644 --- a/src/layer/x86/convolution_3x3_pack8_fp16.h +++ b/src/layer/x86/convolution_3x3_pack8_fp16.h @@ -26,9 +26,10 @@ static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kern {1.0f / 90, -1.0f / 45, 2.0f / 45}, {1.0f / 45, 1.0f / 90, 1.0f / 180}, {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f}}; + {0.0f, 0.0f, 1.0f} + }; -#pragma omp parallel for + #pragma omp parallel for for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) @@ -298,7 +299,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < inch; q++) { const Mat img0 = bottom_blob_bordered.channel(q); @@ -485,7 +486,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top else // if (tiles >= 1) bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int r = 0; r < 64; r++) { Mat tm2 = bottom_blob_tm2.channel(r); @@ -623,7 +624,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top // permute end top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { float* output0_tm = top_blob_tm.channel(p); @@ -1331,7 +1332,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top int h_tm = outh / 6 * 8; const int tiles = w_tm / 8 * h_tm / 8; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { const Mat out0_tm = top_blob_tm.channel(p); diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 95ff6f2c5071..923b54d54964 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -686,7 +686,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option else { // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { float* outptr = top_blob.channel(p); @@ -775,7 +775,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option else { // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { float* outptr = top_blob.channel(p); @@ -833,7 +833,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option else { // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { float* outptr = top_blob.channel(p); @@ -920,7 +920,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option else { // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { float* outptr = top_blob.channel(p); @@ -1001,7 +1001,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt) use_winograd3x3_int8 = false; if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 - && num_input >= 16 && num_output >= 16) + && num_input >= 16 && num_output >= 16) { // winograd is slow on small channel count use_winograd3x3_int8 = true; @@ -1072,7 +1072,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con // conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt); // requantize, reverse scale inplace -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { Option opt_g = opt; @@ -1122,7 +1122,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con // conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt); // dequantize, reverse scale inplace -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { Option opt_g = opt; @@ -1206,7 +1206,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, if (inner_top_blob.empty()) return -100; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < bottom_blob.c; c++) { float* outptr = inner_bottom_blob.channel(c); @@ -1226,7 +1226,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, opt_g.blob_allocator = inner_top_blob.allocator; convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < num_output; c++) { float* outptr = (float*)top_blob.channel(c) + x * outw + y; diff --git a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h index b23ea4b41ffe..41e181352b5f 100644 --- a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h +++ b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h @@ -23,7 +23,7 @@ static void convdw3x3s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co const float* bias = _bias; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Mat out = top_blob.channel(g); @@ -234,7 +234,7 @@ static void convdw3x3s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co const float* bias = _bias; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Mat out = top_blob.channel(g); diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp index c9b624161895..ab9325d87cbf 100644 --- a/src/layer/x86/convolutiondepthwise_x86.cpp +++ b/src/layer/x86/convolutiondepthwise_x86.cpp @@ -367,7 +367,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { float* outptr = top_blob.channel(g); @@ -505,7 +505,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_ const int channels_g = channels / group; // quantize, scale and round to nearest -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Option opt_g = opt; @@ -619,7 +619,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_ const int channels_g = channels / group; const int num_output_g = num_output / group; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g); diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp index 0135fb26f157..80af85fa5889 100644 --- a/src/layer/x86/innerproduct_x86.cpp +++ b/src/layer/x86/innerproduct_x86.cpp @@ -131,7 +131,7 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, int nn_num_output = num_output >> 3; int remain_num_output_start = nn_num_output << 3; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_num_output; pp++) { int p = pp * 8; @@ -243,7 +243,7 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, } // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_num_output_start; p < num_output; p++) { float sum = 0.f; @@ -310,7 +310,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob, int nn_num_output = num_output >> 3; int remain_num_output_start = nn_num_output << 3; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_num_output; pp++) { int p = pp * 8; @@ -397,7 +397,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob, } // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_num_output_start; p < num_output; p++) { float sum = 0.f; From aec7dcdb88fecc9194ac75377741428a29ef2c44 Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Wed, 24 Jun 2020 09:11:04 +0000 Subject: [PATCH 3/4] Restyled by clang-format --- src/layer/x86/cast_x86.cpp | 8 ++--- src/layer/x86/convolution_1x1_pack8.h | 14 ++++---- src/layer/x86/convolution_1x1_pack8_fp16.h | 14 ++++---- src/layer/x86/convolution_2x2_pack8.h | 2 +- src/layer/x86/convolution_2x2_pack8_fp16.h | 2 +- src/layer/x86/convolution_3x3_pack8.h | 13 ++++--- src/layer/x86/convolution_3x3_pack8_fp16.h | 13 ++++--- src/layer/x86/convolution_x86.cpp | 34 +++++++++---------- .../x86/convolutiondepthwise_3x3_pack8_fp16.h | 4 +-- src/layer/x86/convolutiondepthwise_x86.cpp | 8 ++--- src/layer/x86/innerproduct_x86.cpp | 12 +++---- 11 files changed, 61 insertions(+), 63 deletions(-) diff --git a/src/layer/x86/cast_x86.cpp b/src/layer/x86/cast_x86.cpp index efb9df78758f..6c3ba6ad6103 100644 --- a/src/layer/x86/cast_x86.cpp +++ b/src/layer/x86/cast_x86.cpp @@ -134,7 +134,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) for (int i = 0; i < remain; i++) mask.m256i_u32[i] = 0x80000000; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = bottom_blob.channel(q); @@ -166,7 +166,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) for (int i = 0; i < remain; i++) mask.m256i_u32[i] = 0x80000000; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const unsigned short* ptr = bottom_blob.channel(q); @@ -192,7 +192,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (type_from == 4 && type_to == 1) { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const unsigned short* ptr = bottom_blob.channel(q); @@ -217,7 +217,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (type_from == 1 && type_to == 4) { - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = bottom_blob.channel(q); diff --git a/src/layer/x86/convolution_1x1_pack8.h b/src/layer/x86/convolution_1x1_pack8.h index d832d4b78d2e..e75a3c107e3d 100644 --- a/src/layer/x86/convolution_1x1_pack8.h +++ b/src/layer/x86/convolution_1x1_pack8.h @@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con { int nn_size = size / 12; int remain_size_start = nn_size * 12; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = ii * 12; @@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } } nn_size = (size - remain_size_start) >> 3; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 8; @@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con remain_size_start += nn_size << 3; nn_size = (size - remain_size_start) >> 2; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 4; @@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con remain_size_start += nn_size << 2; nn_size = (size - remain_size_start) >> 1; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 2; @@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } remain_size_start += nn_size << 1; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int i = remain_size_start; i < size; i++) { const float* img0 = bottom_blob.channel(0); @@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out = top_blob.channel(p); @@ -1014,7 +1014,7 @@ static void conv1x1s2_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat Mat bottom_blob_shrinked; bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < channels; p++) { const float* r0 = bottom_blob.channel(p); diff --git a/src/layer/x86/convolution_1x1_pack8_fp16.h b/src/layer/x86/convolution_1x1_pack8_fp16.h index eb86507ff2fc..5642e4a210d2 100644 --- a/src/layer/x86/convolution_1x1_pack8_fp16.h +++ b/src/layer/x86/convolution_1x1_pack8_fp16.h @@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob { int nn_size = size / 12; int remain_size_start = nn_size * 12; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = ii * 12; @@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } } nn_size = (size - remain_size_start) >> 3; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 8; @@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob remain_size_start += nn_size << 3; nn_size = (size - remain_size_start) >> 2; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 4; @@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob remain_size_start += nn_size << 2; nn_size = (size - remain_size_start) >> 1; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 2; @@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } remain_size_start += nn_size << 1; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int i = remain_size_start; i < size; i++) { const float* img0 = bottom_blob.channel(0); @@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out = top_blob.channel(p); @@ -1014,7 +1014,7 @@ static void conv1x1s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons Mat bottom_blob_shrinked; bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < channels; p++) { const float* r0 = bottom_blob.channel(p); diff --git a/src/layer/x86/convolution_2x2_pack8.h b/src/layer/x86/convolution_2x2_pack8.h index 79e6716acf46..0e59af34ec3e 100644 --- a/src/layer/x86/convolution_2x2_pack8.h +++ b/src/layer/x86/convolution_2x2_pack8.h @@ -20,7 +20,7 @@ static void conv2x2s1_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat int outch = top_blob.c; const float* bias = _bias; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out0 = top_blob.channel(p); diff --git a/src/layer/x86/convolution_2x2_pack8_fp16.h b/src/layer/x86/convolution_2x2_pack8_fp16.h index 68bbcfe02de7..168bc77da425 100644 --- a/src/layer/x86/convolution_2x2_pack8_fp16.h +++ b/src/layer/x86/convolution_2x2_pack8_fp16.h @@ -201,7 +201,7 @@ static void conv2x2s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons int outch = top_blob.c; const float* bias = _bias; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out0 = top_blob.channel(p); diff --git a/src/layer/x86/convolution_3x3_pack8.h b/src/layer/x86/convolution_3x3_pack8.h index 923779d11a0f..a4bc42bda87e 100644 --- a/src/layer/x86/convolution_3x3_pack8.h +++ b/src/layer/x86/convolution_3x3_pack8.h @@ -26,10 +26,9 @@ static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, M {1.0f / 90, -1.0f / 45, 2.0f / 45}, {1.0f / 45, 1.0f / 90, 1.0f / 180}, {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + {0.0f, 0.0f, 1.0f}}; - #pragma omp parallel for +#pragma omp parallel for for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) @@ -299,7 +298,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < inch; q++) { const Mat img0 = bottom_blob_bordered.channel(q); @@ -486,7 +485,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob else // if (tiles >= 1) bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int r = 0; r < 64; r++) { Mat tm2 = bottom_blob_tm2.channel(r); @@ -625,7 +624,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { float* output0_tm = top_blob_tm.channel(p); @@ -1333,7 +1332,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob int h_tm = outh / 6 * 8; const int tiles = w_tm / 8 * h_tm / 8; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { const Mat out0_tm = top_blob_tm.channel(p); diff --git a/src/layer/x86/convolution_3x3_pack8_fp16.h b/src/layer/x86/convolution_3x3_pack8_fp16.h index 8d372d0c136b..0bd0f5047588 100644 --- a/src/layer/x86/convolution_3x3_pack8_fp16.h +++ b/src/layer/x86/convolution_3x3_pack8_fp16.h @@ -26,10 +26,9 @@ static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kern {1.0f / 90, -1.0f / 45, 2.0f / 45}, {1.0f / 45, 1.0f / 90, 1.0f / 180}, {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f} - }; + {0.0f, 0.0f, 1.0f}}; - #pragma omp parallel for +#pragma omp parallel for for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) @@ -299,7 +298,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < inch; q++) { const Mat img0 = bottom_blob_bordered.channel(q); @@ -486,7 +485,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top else // if (tiles >= 1) bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int r = 0; r < 64; r++) { Mat tm2 = bottom_blob_tm2.channel(r); @@ -624,7 +623,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top // permute end top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { float* output0_tm = top_blob_tm.channel(p); @@ -1332,7 +1331,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top int h_tm = outh / 6 * 8; const int tiles = w_tm / 8 * h_tm / 8; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { const Mat out0_tm = top_blob_tm.channel(p); diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 923b54d54964..c7c3d006a653 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -685,8 +685,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else { -// num_output - #pragma omp parallel for num_threads(opt.num_threads) + // num_output +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { float* outptr = top_blob.channel(p); @@ -774,8 +774,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else { -// num_output - #pragma omp parallel for num_threads(opt.num_threads) + // num_output +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { float* outptr = top_blob.channel(p); @@ -832,8 +832,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else { -// num_output - #pragma omp parallel for num_threads(opt.num_threads) + // num_output +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { float* outptr = top_blob.channel(p); @@ -919,8 +919,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option } else { -// num_output - #pragma omp parallel for num_threads(opt.num_threads) + // num_output +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { float* outptr = top_blob.channel(p); @@ -1001,7 +1001,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt) use_winograd3x3_int8 = false; if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 - && num_input >= 16 && num_output >= 16) + && num_input >= 16 && num_output >= 16) { // winograd is slow on small channel count use_winograd3x3_int8 = true; @@ -1069,10 +1069,10 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con if (use_winograd3x3_int8) { conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt); -// conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt); + // conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt); -// requantize, reverse scale inplace - #pragma omp parallel for num_threads(opt.num_threads) + // requantize, reverse scale inplace +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { Option opt_g = opt; @@ -1119,10 +1119,10 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con if (use_winograd3x3_int8) { conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt); -// conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt); + // conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt); -// dequantize, reverse scale inplace - #pragma omp parallel for num_threads(opt.num_threads) + // dequantize, reverse scale inplace +#pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { Option opt_g = opt; @@ -1206,7 +1206,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, if (inner_top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < bottom_blob.c; c++) { float* outptr = inner_bottom_blob.channel(c); @@ -1226,7 +1226,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, opt_g.blob_allocator = inner_top_blob.allocator; convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g); - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < num_output; c++) { float* outptr = (float*)top_blob.channel(c) + x * outw + y; diff --git a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h index 41e181352b5f..b23ea4b41ffe 100644 --- a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h +++ b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h @@ -23,7 +23,7 @@ static void convdw3x3s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co const float* bias = _bias; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Mat out = top_blob.channel(g); @@ -234,7 +234,7 @@ static void convdw3x3s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co const float* bias = _bias; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Mat out = top_blob.channel(g); diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp index ab9325d87cbf..03ffb1011f66 100644 --- a/src/layer/x86/convolutiondepthwise_x86.cpp +++ b/src/layer/x86/convolutiondepthwise_x86.cpp @@ -367,7 +367,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { float* outptr = top_blob.channel(g); @@ -504,8 +504,8 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_ const int channels_g = channels / group; -// quantize, scale and round to nearest - #pragma omp parallel for num_threads(opt.num_threads) + // quantize, scale and round to nearest +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Option opt_g = opt; @@ -619,7 +619,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_ const int channels_g = channels / group; const int num_output_g = num_output / group; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g); diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp index 80af85fa5889..1d7633a549b8 100644 --- a/src/layer/x86/innerproduct_x86.cpp +++ b/src/layer/x86/innerproduct_x86.cpp @@ -131,7 +131,7 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, int nn_num_output = num_output >> 3; int remain_num_output_start = nn_num_output << 3; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_num_output; pp++) { int p = pp * 8; @@ -242,8 +242,8 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, } } - // num_output - #pragma omp parallel for num_threads(opt.num_threads) +// num_output +#pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_num_output_start; p < num_output; p++) { float sum = 0.f; @@ -310,7 +310,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob, int nn_num_output = num_output >> 3; int remain_num_output_start = nn_num_output << 3; - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_num_output; pp++) { int p = pp * 8; @@ -396,8 +396,8 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob, } } - // num_output - #pragma omp parallel for num_threads(opt.num_threads) +// num_output +#pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_num_output_start; p < num_output; p++) { float sum = 0.f; From 8831fe58254b29de1131e88c29826bb2615d0157 Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Wed, 24 Jun 2020 09:11:05 +0000 Subject: [PATCH 4/4] Restyled by astyle --- src/layer/x86/cast_x86.cpp | 8 ++++---- src/layer/x86/convolution_1x1_pack8.h | 14 +++++++------- src/layer/x86/convolution_1x1_pack8_fp16.h | 14 +++++++------- src/layer/x86/convolution_2x2_pack8.h | 2 +- src/layer/x86/convolution_2x2_pack8_fp16.h | 2 +- src/layer/x86/convolution_3x3_pack8.h | 13 +++++++------ src/layer/x86/convolution_3x3_pack8_fp16.h | 13 +++++++------ src/layer/x86/convolution_x86.cpp | 18 +++++++++--------- .../x86/convolutiondepthwise_3x3_pack8_fp16.h | 4 ++-- src/layer/x86/convolutiondepthwise_x86.cpp | 6 +++--- src/layer/x86/innerproduct_x86.cpp | 8 ++++---- 11 files changed, 52 insertions(+), 50 deletions(-) diff --git a/src/layer/x86/cast_x86.cpp b/src/layer/x86/cast_x86.cpp index 6c3ba6ad6103..efb9df78758f 100644 --- a/src/layer/x86/cast_x86.cpp +++ b/src/layer/x86/cast_x86.cpp @@ -134,7 +134,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) for (int i = 0; i < remain; i++) mask.m256i_u32[i] = 0x80000000; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = bottom_blob.channel(q); @@ -166,7 +166,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) for (int i = 0; i < remain; i++) mask.m256i_u32[i] = 0x80000000; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const unsigned short* ptr = bottom_blob.channel(q); @@ -192,7 +192,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (type_from == 4 && type_to == 1) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const unsigned short* ptr = bottom_blob.channel(q); @@ -217,7 +217,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } if (type_from == 1 && type_to == 4) { -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const float* ptr = bottom_blob.channel(q); diff --git a/src/layer/x86/convolution_1x1_pack8.h b/src/layer/x86/convolution_1x1_pack8.h index e75a3c107e3d..d832d4b78d2e 100644 --- a/src/layer/x86/convolution_1x1_pack8.h +++ b/src/layer/x86/convolution_1x1_pack8.h @@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con { int nn_size = size / 12; int remain_size_start = nn_size * 12; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = ii * 12; @@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } } nn_size = (size - remain_size_start) >> 3; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 8; @@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con remain_size_start += nn_size << 3; nn_size = (size - remain_size_start) >> 2; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 4; @@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con remain_size_start += nn_size << 2; nn_size = (size - remain_size_start) >> 1; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 2; @@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } remain_size_start += nn_size << 1; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int i = remain_size_start; i < size; i++) { const float* img0 = bottom_blob.channel(0); @@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con } } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out = top_blob.channel(p); @@ -1014,7 +1014,7 @@ static void conv1x1s2_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat Mat bottom_blob_shrinked; bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < channels; p++) { const float* r0 = bottom_blob.channel(p); diff --git a/src/layer/x86/convolution_1x1_pack8_fp16.h b/src/layer/x86/convolution_1x1_pack8_fp16.h index 5642e4a210d2..eb86507ff2fc 100644 --- a/src/layer/x86/convolution_1x1_pack8_fp16.h +++ b/src/layer/x86/convolution_1x1_pack8_fp16.h @@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob { int nn_size = size / 12; int remain_size_start = nn_size * 12; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = ii * 12; @@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } } nn_size = (size - remain_size_start) >> 3; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 8; @@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob remain_size_start += nn_size << 3; nn_size = (size - remain_size_start) >> 2; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 4; @@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob remain_size_start += nn_size << 2; nn_size = (size - remain_size_start) >> 1; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int ii = 0; ii < nn_size; ii++) { int i = remain_size_start + ii * 2; @@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } remain_size_start += nn_size << 1; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int i = remain_size_start; i < size; i++) { const float* img0 = bottom_blob.channel(0); @@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob } } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out = top_blob.channel(p); @@ -1014,7 +1014,7 @@ static void conv1x1s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons Mat bottom_blob_shrinked; bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < channels; p++) { const float* r0 = bottom_blob.channel(p); diff --git a/src/layer/x86/convolution_2x2_pack8.h b/src/layer/x86/convolution_2x2_pack8.h index 0e59af34ec3e..79e6716acf46 100644 --- a/src/layer/x86/convolution_2x2_pack8.h +++ b/src/layer/x86/convolution_2x2_pack8.h @@ -20,7 +20,7 @@ static void conv2x2s1_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat int outch = top_blob.c; const float* bias = _bias; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out0 = top_blob.channel(p); diff --git a/src/layer/x86/convolution_2x2_pack8_fp16.h b/src/layer/x86/convolution_2x2_pack8_fp16.h index 168bc77da425..68bbcfe02de7 100644 --- a/src/layer/x86/convolution_2x2_pack8_fp16.h +++ b/src/layer/x86/convolution_2x2_pack8_fp16.h @@ -201,7 +201,7 @@ static void conv2x2s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons int outch = top_blob.c; const float* bias = _bias; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { Mat out0 = top_blob.channel(p); diff --git a/src/layer/x86/convolution_3x3_pack8.h b/src/layer/x86/convolution_3x3_pack8.h index a4bc42bda87e..923779d11a0f 100644 --- a/src/layer/x86/convolution_3x3_pack8.h +++ b/src/layer/x86/convolution_3x3_pack8.h @@ -26,9 +26,10 @@ static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, M {1.0f / 90, -1.0f / 45, 2.0f / 45}, {1.0f / 45, 1.0f / 90, 1.0f / 180}, {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f}}; + {0.0f, 0.0f, 1.0f} + }; -#pragma omp parallel for + #pragma omp parallel for for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) @@ -298,7 +299,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < inch; q++) { const Mat img0 = bottom_blob_bordered.channel(q); @@ -485,7 +486,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob else // if (tiles >= 1) bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int r = 0; r < 64; r++) { Mat tm2 = bottom_blob_tm2.channel(r); @@ -624,7 +625,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { float* output0_tm = top_blob_tm.channel(p); @@ -1332,7 +1333,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob int h_tm = outh / 6 * 8; const int tiles = w_tm / 8 * h_tm / 8; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { const Mat out0_tm = top_blob_tm.channel(p); diff --git a/src/layer/x86/convolution_3x3_pack8_fp16.h b/src/layer/x86/convolution_3x3_pack8_fp16.h index 0bd0f5047588..8d372d0c136b 100644 --- a/src/layer/x86/convolution_3x3_pack8_fp16.h +++ b/src/layer/x86/convolution_3x3_pack8_fp16.h @@ -26,9 +26,10 @@ static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kern {1.0f / 90, -1.0f / 45, 2.0f / 45}, {1.0f / 45, 1.0f / 90, 1.0f / 180}, {1.0f / 45, -1.0f / 90, 1.0f / 180}, - {0.0f, 0.0f, 1.0f}}; + {0.0f, 0.0f, 1.0f} + }; -#pragma omp parallel for + #pragma omp parallel for for (int p = 0; p < outch; p++) { for (int q = 0; q < inch; q++) @@ -298,7 +299,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < inch; q++) { const Mat img0 = bottom_blob_bordered.channel(q); @@ -485,7 +486,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top else // if (tiles >= 1) bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int r = 0; r < 64; r++) { Mat tm2 = bottom_blob_tm2.channel(r); @@ -623,7 +624,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top // permute end top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { float* output0_tm = top_blob_tm.channel(p); @@ -1331,7 +1332,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top int h_tm = outh / 6 * 8; const int tiles = w_tm / 8 * h_tm / 8; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outch; p++) { const Mat out0_tm = top_blob_tm.channel(p); diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index c7c3d006a653..84d3b54719d7 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -686,7 +686,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option else { // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { float* outptr = top_blob.channel(p); @@ -775,7 +775,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option else { // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { float* outptr = top_blob.channel(p); @@ -833,7 +833,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option else { // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { float* outptr = top_blob.channel(p); @@ -920,7 +920,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option else { // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { float* outptr = top_blob.channel(p); @@ -1001,7 +1001,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt) use_winograd3x3_int8 = false; if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 - && num_input >= 16 && num_output >= 16) + && num_input >= 16 && num_output >= 16) { // winograd is slow on small channel count use_winograd3x3_int8 = true; @@ -1072,7 +1072,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con // conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt); // requantize, reverse scale inplace -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { Option opt_g = opt; @@ -1122,7 +1122,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con // conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt); // dequantize, reverse scale inplace -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output; p++) { Option opt_g = opt; @@ -1206,7 +1206,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, if (inner_top_blob.empty()) return -100; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < bottom_blob.c; c++) { float* outptr = inner_bottom_blob.channel(c); @@ -1226,7 +1226,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, opt_g.blob_allocator = inner_top_blob.allocator; convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g); -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int c = 0; c < num_output; c++) { float* outptr = (float*)top_blob.channel(c) + x * outw + y; diff --git a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h index b23ea4b41ffe..41e181352b5f 100644 --- a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h +++ b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h @@ -23,7 +23,7 @@ static void convdw3x3s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co const float* bias = _bias; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Mat out = top_blob.channel(g); @@ -234,7 +234,7 @@ static void convdw3x3s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co const float* bias = _bias; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Mat out = top_blob.channel(g); diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp index 03ffb1011f66..fafca48e3e1a 100644 --- a/src/layer/x86/convolutiondepthwise_x86.cpp +++ b/src/layer/x86/convolutiondepthwise_x86.cpp @@ -367,7 +367,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { float* outptr = top_blob.channel(g); @@ -505,7 +505,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_ const int channels_g = channels / group; // quantize, scale and round to nearest -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { Option opt_g = opt; @@ -619,7 +619,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_ const int channels_g = channels / group; const int num_output_g = num_output / group; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g); diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp index 1d7633a549b8..ab551332749c 100644 --- a/src/layer/x86/innerproduct_x86.cpp +++ b/src/layer/x86/innerproduct_x86.cpp @@ -131,7 +131,7 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, int nn_num_output = num_output >> 3; int remain_num_output_start = nn_num_output << 3; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_num_output; pp++) { int p = pp * 8; @@ -243,7 +243,7 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, } // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_num_output_start; p < num_output; p++) { float sum = 0.f; @@ -310,7 +310,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob, int nn_num_output = num_output >> 3; int remain_num_output_start = nn_num_output << 3; -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_num_output; pp++) { int p = pp * 8; @@ -397,7 +397,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob, } // num_output -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_num_output_start; p < num_output; p++) { float sum = 0.f;