From 756f5aa23d0bd870817f33a0a1d5eeb8c09e7dde Mon Sep 17 00:00:00 2001
From: "Restyled.io" <commits@restyled.io>
Date: Wed, 24 Jun 2020 09:11:02 +0000
Subject: [PATCH 1/4] Restyled by clang-format

---
 src/layer/x86/avx_usability.h                 |   3 +-
 src/layer/x86/cast_x86.cpp                    |   8 +-
 src/layer/x86/convolution_1x1_pack8.h         | 174 ++--
 src/layer/x86/convolution_1x1_pack8_fp16.h    | 178 ++--
 src/layer/x86/convolution_2x2_pack8.h         |   2 +-
 src/layer/x86/convolution_2x2_pack8_fp16.h    |   4 +-
 src/layer/x86/convolution_3x3_pack8.h         |  14 +-
 src/layer/x86/convolution_3x3_pack8_fp16.h    |  16 +-
 src/layer/x86/convolution_x86.cpp             |  86 +-
 .../x86/convolutiondepthwise_3x3_pack8_fp16.h |   6 +-
 src/layer/x86/convolutiondepthwise_x86.cpp    |  25 +-
 src/layer/x86/innerproduct_x86.cpp            | 863 +++++++++---------
 src/layer/x86/innerproduct_x86.h              |  14 +-
 tests/testutil.h                              |   1 -
 14 files changed, 708 insertions(+), 686 deletions(-)
 mode change 100755 => 100644 src/layer/x86/innerproduct_x86.cpp

diff --git a/src/layer/x86/avx_usability.h b/src/layer/x86/avx_usability.h
index c35b84dcd559..e05b7554253c 100644
--- a/src/layer/x86/avx_usability.h
+++ b/src/layer/x86/avx_usability.h
@@ -18,7 +18,8 @@
 #define AVX_USABILITY
 #include <immintrin.h>
 
-static inline __m256 loadfp16(const unsigned short* ptr) {
+static inline __m256 loadfp16(const unsigned short* ptr)
+{
     return _mm256_cvtph_ps(_mm_load_si128((__m128i*)(ptr)));
 }
 static inline __m256 _mm256_fmadd_1_ps(__m256 a, __m256 b, float c)
diff --git a/src/layer/x86/cast_x86.cpp b/src/layer/x86/cast_x86.cpp
index efb9df78758f..6c3ba6ad6103 100644
--- a/src/layer/x86/cast_x86.cpp
+++ b/src/layer/x86/cast_x86.cpp
@@ -134,7 +134,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
         for (int i = 0; i < remain; i++)
             mask.m256i_u32[i] = 0x80000000;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -166,7 +166,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
         for (int i = 0; i < remain; i++)
             mask.m256i_u32[i] = 0x80000000;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const unsigned short* ptr = bottom_blob.channel(q);
@@ -192,7 +192,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
     }
     if (type_from == 4 && type_to == 1)
     {
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const unsigned short* ptr = bottom_blob.channel(q);
@@ -217,7 +217,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
     }
     if (type_from == 1 && type_to == 4)
     {
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
diff --git a/src/layer/x86/convolution_1x1_pack8.h b/src/layer/x86/convolution_1x1_pack8.h
index 6665741faa9e..e75a3c107e3d 100644
--- a/src/layer/x86/convolution_1x1_pack8.h
+++ b/src/layer/x86/convolution_1x1_pack8.h
@@ -14,7 +14,6 @@
 
 static void conv1x1s1_sgemm_transform_kernel_pack8_avx(const Mat& kernel, Mat& weight_data_pack8, int num_input, int num_output)
 {
-   
     // src = kw-kh-inch-outch
     // dst = 8b-8a-kw-kh-inch/8a-outch/8b
     Mat weight_data_r2 = kernel.reshape(1, num_input, num_output);
@@ -109,90 +108,89 @@ static void conv1x1s1_sgemm_transform_kernel_pack8_avx(const Mat& kernel, Mat& w
             const float* k77 = k7.row(p + 7);
 
             float* g00 = g0.row(p / 8);
-                g00[0] = k00[0];
-                g00[1] = k10[0];
-                g00[2] = k20[0];
-                g00[3] = k30[0];
-                g00[4] = k40[0];
-                g00[5] = k50[0];
-                g00[6] = k60[0];
-                g00[7] = k70[0];
-                g00 += 8;
-                g00[0] = k01[0];
-                g00[1] = k11[0];
-                g00[2] = k21[0];
-                g00[3] = k31[0];
-                g00[4] = k41[0];
-                g00[5] = k51[0];
-                g00[6] = k61[0];
-                g00[7] = k71[0];
-
-                g00 += 8;
-                g00[0] = k02[0];
-                g00[1] = k12[0];
-                g00[2] = k22[0];
-                g00[3] = k32[0];
-                g00[4] = k42[0];
-                g00[5] = k52[0];
-                g00[6] = k62[0];
-                g00[7] = k72[0];
-
-                g00 += 8;
-                g00[0] = k03[0];
-                g00[1] = k13[0];
-                g00[2] = k23[0];
-                g00[3] = k33[0];
-                g00[4] = k43[0];
-                g00[5] = k53[0];
-                g00[6] = k63[0];
-                g00[7] = k73[0];
-
-                g00 += 8;
-                g00[0] = k04[0];
-                g00[1] = k14[0];
-                g00[2] = k24[0];
-                g00[3] = k34[0];
-                g00[4] = k44[0];
-                g00[5] = k54[0];
-                g00[6] = k64[0];
-                g00[7] = k74[0];
-
-                g00 += 8;
-                g00[0] = k05[0];
-                g00[1] = k15[0];
-                g00[2] = k25[0];
-                g00[3] = k35[0];
-                g00[4] = k45[0];
-                g00[5] = k55[0];
-                g00[6] = k65[0];
-                g00[7] = k75[0];
-
-                g00 += 8;
-                g00[0] = k06[0];
-                g00[1] = k16[0];
-                g00[2] = k26[0];
-                g00[3] = k36[0];
-                g00[4] = k46[0];
-                g00[5] = k56[0];
-                g00[6] = k66[0];
-                g00[7] = k76[0];
-
-                g00 += 8;
-                g00[0] = k07[0];
-                g00[1] = k17[0];
-                g00[2] = k27[0];
-                g00[3] = k37[0];
-                g00[4] = k47[0];
-                g00[5] = k57[0];
-                g00[6] = k67[0];
-                g00[7] = k77[0];
-
-                g00 += 8;
+            g00[0] = k00[0];
+            g00[1] = k10[0];
+            g00[2] = k20[0];
+            g00[3] = k30[0];
+            g00[4] = k40[0];
+            g00[5] = k50[0];
+            g00[6] = k60[0];
+            g00[7] = k70[0];
+            g00 += 8;
+            g00[0] = k01[0];
+            g00[1] = k11[0];
+            g00[2] = k21[0];
+            g00[3] = k31[0];
+            g00[4] = k41[0];
+            g00[5] = k51[0];
+            g00[6] = k61[0];
+            g00[7] = k71[0];
+
+            g00 += 8;
+            g00[0] = k02[0];
+            g00[1] = k12[0];
+            g00[2] = k22[0];
+            g00[3] = k32[0];
+            g00[4] = k42[0];
+            g00[5] = k52[0];
+            g00[6] = k62[0];
+            g00[7] = k72[0];
+
+            g00 += 8;
+            g00[0] = k03[0];
+            g00[1] = k13[0];
+            g00[2] = k23[0];
+            g00[3] = k33[0];
+            g00[4] = k43[0];
+            g00[5] = k53[0];
+            g00[6] = k63[0];
+            g00[7] = k73[0];
+
+            g00 += 8;
+            g00[0] = k04[0];
+            g00[1] = k14[0];
+            g00[2] = k24[0];
+            g00[3] = k34[0];
+            g00[4] = k44[0];
+            g00[5] = k54[0];
+            g00[6] = k64[0];
+            g00[7] = k74[0];
+
+            g00 += 8;
+            g00[0] = k05[0];
+            g00[1] = k15[0];
+            g00[2] = k25[0];
+            g00[3] = k35[0];
+            g00[4] = k45[0];
+            g00[5] = k55[0];
+            g00[6] = k65[0];
+            g00[7] = k75[0];
+
+            g00 += 8;
+            g00[0] = k06[0];
+            g00[1] = k16[0];
+            g00[2] = k26[0];
+            g00[3] = k36[0];
+            g00[4] = k46[0];
+            g00[5] = k56[0];
+            g00[6] = k66[0];
+            g00[7] = k76[0];
+
+            g00 += 8;
+            g00[0] = k07[0];
+            g00[1] = k17[0];
+            g00[2] = k27[0];
+            g00[3] = k37[0];
+            g00[4] = k47[0];
+            g00[5] = k57[0];
+            g00[6] = k67[0];
+            g00[7] = k77[0];
+
+            g00 += 8;
         }
     }
 }
 
-
 static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
@@ -212,7 +210,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
     {
         int nn_size = size / 12;
         int remain_size_start = nn_size * 12;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = ii * 12;
@@ -253,7 +251,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
             }
         }
         nn_size = (size - remain_size_start) >> 3;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 8;
@@ -290,7 +288,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
         remain_size_start += nn_size << 3;
         nn_size = (size - remain_size_start) >> 2;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 4;
@@ -317,7 +315,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
 
         remain_size_start += nn_size << 2;
         nn_size = (size - remain_size_start) >> 1;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 2;
@@ -339,7 +337,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
         }
 
         remain_size_start += nn_size << 1;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int i = remain_size_start; i < size; i++)
         {
             const float* img0 = bottom_blob.channel(0);
@@ -355,7 +353,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
             }
         }
     }
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -1016,7 +1014,7 @@ static void conv1x1s2_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat
     Mat bottom_blob_shrinked;
     bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < channels; p++)
     {
         const float* r0 = bottom_blob.channel(p);
diff --git a/src/layer/x86/convolution_1x1_pack8_fp16.h b/src/layer/x86/convolution_1x1_pack8_fp16.h
index 222ffff8f3e9..5642e4a210d2 100644
--- a/src/layer/x86/convolution_1x1_pack8_fp16.h
+++ b/src/layer/x86/convolution_1x1_pack8_fp16.h
@@ -13,7 +13,7 @@
 // specific language governing permissions and limitations under the License.
 static void conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(const Mat& kernel, Mat& weight_data_pack8, int num_input, int num_output)
 {
-     // src = kw-kh-inch-outch
+    // src = kw-kh-inch-outch
     // dst = 8b-8a-kw-kh-inch/8a-outch/8b
     Mat weight_data_r2 = kernel.reshape(1, num_input, num_output);
 
@@ -106,92 +106,90 @@ static void conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(const Mat& kernel, M
             const float* k76 = k7.row(p + 6);
             const float* k77 = k7.row(p + 7);
 
-            unsigned short* g00 =(unsigned short*) g0.row(p / 8);
-                g00[0] = float32_to_float16(k00[0]);
-                g00[1] = float32_to_float16(k10[0]);
-                g00[2] = float32_to_float16(k20[0]);
-                g00[3] = float32_to_float16(k30[0]);
-                g00[4] = float32_to_float16(k40[0]);
-                g00[5] = float32_to_float16(k50[0]);
-                g00[6] = float32_to_float16(k60[0]);
-                g00[7] = float32_to_float16(k70[0]);
-                g00 += 8;
-                g00[0] = float32_to_float16(k01[0]);
-                g00[1] = float32_to_float16(k11[0]);
-                g00[2] = float32_to_float16(k21[0]);
-                g00[3] = float32_to_float16(k31[0]);
-                g00[4] = float32_to_float16(k41[0]);
-                g00[5] = float32_to_float16(k51[0]);
-                g00[6] = float32_to_float16(k61[0]);
-                g00[7] = float32_to_float16(k71[0]);
-
-                g00 += 8;
-                g00[0] = float32_to_float16(k02[0]);
-                g00[1] = float32_to_float16(k12[0]);
-                g00[2] = float32_to_float16(k22[0]);
-                g00[3] = float32_to_float16(k32[0]);
-                g00[4] = float32_to_float16(k42[0]);
-                g00[5] = float32_to_float16(k52[0]);
-                g00[6] = float32_to_float16(k62[0]);
-                g00[7] = float32_to_float16(k72[0]);
-
-                g00 += 8;
-                g00[0] = float32_to_float16(k03[0]);
-                g00[1] = float32_to_float16(k13[0]);
-                g00[2] = float32_to_float16(k23[0]);
-                g00[3] = float32_to_float16(k33[0]);
-                g00[4] = float32_to_float16(k43[0]);
-                g00[5] = float32_to_float16(k53[0]);
-                g00[6] = float32_to_float16(k63[0]);
-                g00[7] = float32_to_float16(k73[0]);
-
-                g00 += 8;
-                g00[0] = float32_to_float16(k04[0]);
-                g00[1] = float32_to_float16(k14[0]);
-                g00[2] = float32_to_float16(k24[0]);
-                g00[3] = float32_to_float16(k34[0]);
-                g00[4] = float32_to_float16(k44[0]);
-                g00[5] = float32_to_float16(k54[0]);
-                g00[6] = float32_to_float16(k64[0]);
-                g00[7] = float32_to_float16(k74[0]);
-
-                g00 += 8;
-                g00[0] = float32_to_float16(k05[0]);
-                g00[1] = float32_to_float16(k15[0]);
-                g00[2] = float32_to_float16(k25[0]);
-                g00[3] = float32_to_float16(k35[0]);
-                g00[4] = float32_to_float16(k45[0]);
-                g00[5] = float32_to_float16(k55[0]);
-                g00[6] = float32_to_float16(k65[0]);
-                g00[7] = float32_to_float16(k75[0]);
-
-                g00 += 8;
-                g00[0] = float32_to_float16(k06[0]);
-                g00[1] = float32_to_float16(k16[0]);
-                g00[2] = float32_to_float16(k26[0]);
-                g00[3] = float32_to_float16(k36[0]);
-                g00[4] = float32_to_float16(k46[0]);
-                g00[5] = float32_to_float16(k56[0]);
-                g00[6] = float32_to_float16(k66[0]);
-                g00[7] = float32_to_float16(k76[0]);
-
-                g00 += 8;
-                g00[0] = float32_to_float16(k07[0]);
-                g00[1] = float32_to_float16(k17[0]);
-                g00[2] = float32_to_float16(k27[0]);
-                g00[3] = float32_to_float16(k37[0]);
-                g00[4] = float32_to_float16(k47[0]);
-                g00[5] = float32_to_float16(k57[0]);
-                g00[6] = float32_to_float16(k67[0]);
-                g00[7] = float32_to_float16(k77[0]);
-
-                g00 += 8;
+            unsigned short* g00 = (unsigned short*)g0.row(p / 8);
+            g00[0] = float32_to_float16(k00[0]);
+            g00[1] = float32_to_float16(k10[0]);
+            g00[2] = float32_to_float16(k20[0]);
+            g00[3] = float32_to_float16(k30[0]);
+            g00[4] = float32_to_float16(k40[0]);
+            g00[5] = float32_to_float16(k50[0]);
+            g00[6] = float32_to_float16(k60[0]);
+            g00[7] = float32_to_float16(k70[0]);
+            g00 += 8;
+            g00[0] = float32_to_float16(k01[0]);
+            g00[1] = float32_to_float16(k11[0]);
+            g00[2] = float32_to_float16(k21[0]);
+            g00[3] = float32_to_float16(k31[0]);
+            g00[4] = float32_to_float16(k41[0]);
+            g00[5] = float32_to_float16(k51[0]);
+            g00[6] = float32_to_float16(k61[0]);
+            g00[7] = float32_to_float16(k71[0]);
+
+            g00 += 8;
+            g00[0] = float32_to_float16(k02[0]);
+            g00[1] = float32_to_float16(k12[0]);
+            g00[2] = float32_to_float16(k22[0]);
+            g00[3] = float32_to_float16(k32[0]);
+            g00[4] = float32_to_float16(k42[0]);
+            g00[5] = float32_to_float16(k52[0]);
+            g00[6] = float32_to_float16(k62[0]);
+            g00[7] = float32_to_float16(k72[0]);
+
+            g00 += 8;
+            g00[0] = float32_to_float16(k03[0]);
+            g00[1] = float32_to_float16(k13[0]);
+            g00[2] = float32_to_float16(k23[0]);
+            g00[3] = float32_to_float16(k33[0]);
+            g00[4] = float32_to_float16(k43[0]);
+            g00[5] = float32_to_float16(k53[0]);
+            g00[6] = float32_to_float16(k63[0]);
+            g00[7] = float32_to_float16(k73[0]);
+
+            g00 += 8;
+            g00[0] = float32_to_float16(k04[0]);
+            g00[1] = float32_to_float16(k14[0]);
+            g00[2] = float32_to_float16(k24[0]);
+            g00[3] = float32_to_float16(k34[0]);
+            g00[4] = float32_to_float16(k44[0]);
+            g00[5] = float32_to_float16(k54[0]);
+            g00[6] = float32_to_float16(k64[0]);
+            g00[7] = float32_to_float16(k74[0]);
+
+            g00 += 8;
+            g00[0] = float32_to_float16(k05[0]);
+            g00[1] = float32_to_float16(k15[0]);
+            g00[2] = float32_to_float16(k25[0]);
+            g00[3] = float32_to_float16(k35[0]);
+            g00[4] = float32_to_float16(k45[0]);
+            g00[5] = float32_to_float16(k55[0]);
+            g00[6] = float32_to_float16(k65[0]);
+            g00[7] = float32_to_float16(k75[0]);
+
+            g00 += 8;
+            g00[0] = float32_to_float16(k06[0]);
+            g00[1] = float32_to_float16(k16[0]);
+            g00[2] = float32_to_float16(k26[0]);
+            g00[3] = float32_to_float16(k36[0]);
+            g00[4] = float32_to_float16(k46[0]);
+            g00[5] = float32_to_float16(k56[0]);
+            g00[6] = float32_to_float16(k66[0]);
+            g00[7] = float32_to_float16(k76[0]);
+
+            g00 += 8;
+            g00[0] = float32_to_float16(k07[0]);
+            g00[1] = float32_to_float16(k17[0]);
+            g00[2] = float32_to_float16(k27[0]);
+            g00[3] = float32_to_float16(k37[0]);
+            g00[4] = float32_to_float16(k47[0]);
+            g00[5] = float32_to_float16(k57[0]);
+            g00[6] = float32_to_float16(k67[0]);
+            g00[7] = float32_to_float16(k77[0]);
+
+            g00 += 8;
         }
     }
 }
 
-
-
 static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
 {
     int w = bottom_blob.w;
@@ -212,7 +210,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
     {
         int nn_size = size / 12;
         int remain_size_start = nn_size * 12;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = ii * 12;
@@ -253,7 +251,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
             }
         }
         nn_size = (size - remain_size_start) >> 3;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 8;
@@ -290,7 +288,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         remain_size_start += nn_size << 3;
         nn_size = (size - remain_size_start) >> 2;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 4;
@@ -317,7 +315,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
 
         remain_size_start += nn_size << 2;
         nn_size = (size - remain_size_start) >> 1;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 2;
@@ -339,7 +337,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         }
 
         remain_size_start += nn_size << 1;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int i = remain_size_start; i < size; i++)
         {
             const float* img0 = bottom_blob.channel(0);
@@ -355,7 +353,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
             }
         }
     }
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -1016,7 +1014,7 @@ static void conv1x1s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons
     Mat bottom_blob_shrinked;
     bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < channels; p++)
     {
         const float* r0 = bottom_blob.channel(p);
diff --git a/src/layer/x86/convolution_2x2_pack8.h b/src/layer/x86/convolution_2x2_pack8.h
index 79e6716acf46..0e59af34ec3e 100644
--- a/src/layer/x86/convolution_2x2_pack8.h
+++ b/src/layer/x86/convolution_2x2_pack8.h
@@ -20,7 +20,7 @@ static void conv2x2s1_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat
     int outch = top_blob.c;
     const float* bias = _bias;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out0 = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_2x2_pack8_fp16.h b/src/layer/x86/convolution_2x2_pack8_fp16.h
index 0d87ef826363..168bc77da425 100644
--- a/src/layer/x86/convolution_2x2_pack8_fp16.h
+++ b/src/layer/x86/convolution_2x2_pack8_fp16.h
@@ -106,7 +106,7 @@ static void conv2x2s1_weight_fp16_pack8_avx(const Mat& kernel, Mat& kernel_tm_pa
             const float* k76 = k7.row(p + 6);
             const float* k77 = k7.row(p + 7);
 
-            unsigned short* g00 =(unsigned short*) g0.row(p / 8);
+            unsigned short* g00 = (unsigned short*)g0.row(p / 8);
 
             for (int k = 0; k < 4; k++)
             {
@@ -201,7 +201,7 @@ static void conv2x2s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons
     int outch = top_blob.c;
     const float* bias = _bias;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out0 = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_3x3_pack8.h b/src/layer/x86/convolution_3x3_pack8.h
index 2c9ab4ee3af9..a4bc42bda87e 100644
--- a/src/layer/x86/convolution_3x3_pack8.h
+++ b/src/layer/x86/convolution_3x3_pack8.h
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-
 static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch)
 {
     // winograd63 transform kernel
@@ -27,10 +26,9 @@ static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, M
         {1.0f / 90, -1.0f / 45, 2.0f / 45},
         {1.0f / 45, 1.0f / 90, 1.0f / 180},
         {1.0f / 45, -1.0f / 90, 1.0f / 180},
-        {0.0f, 0.0f, 1.0f}
-    };
+        {0.0f, 0.0f, 1.0f}};
 
-    #pragma omp parallel for
+#pragma omp parallel for
     for (int p = 0; p < outch; p++)
     {
         for (int q = 0; q < inch; q++)
@@ -300,7 +298,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < inch; q++)
         {
             const Mat img0 = bottom_blob_bordered.channel(q);
@@ -487,7 +485,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         else // if (tiles >= 1)
             bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int r = 0; r < 64; r++)
         {
             Mat tm2 = bottom_blob_tm2.channel(r);
@@ -626,7 +624,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
 
         top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator);
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             float* output0_tm = top_blob_tm.channel(p);
@@ -1334,7 +1332,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm / 8 * h_tm / 8;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             const Mat out0_tm = top_blob_tm.channel(p);
diff --git a/src/layer/x86/convolution_3x3_pack8_fp16.h b/src/layer/x86/convolution_3x3_pack8_fp16.h
index a77d91b65f06..0bd0f5047588 100644
--- a/src/layer/x86/convolution_3x3_pack8_fp16.h
+++ b/src/layer/x86/convolution_3x3_pack8_fp16.h
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-
 static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch)
 {
     // winograd63 transform kernel
@@ -27,10 +26,9 @@ static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kern
         {1.0f / 90, -1.0f / 45, 2.0f / 45},
         {1.0f / 45, 1.0f / 90, 1.0f / 180},
         {1.0f / 45, -1.0f / 90, 1.0f / 180},
-        {0.0f, 0.0f, 1.0f}
-    };
+        {0.0f, 0.0f, 1.0f}};
 
-    #pragma omp parallel for
+#pragma omp parallel for
     for (int p = 0; p < outch; p++)
     {
         for (int q = 0; q < inch; q++)
@@ -300,7 +298,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < inch; q++)
         {
             const Mat img0 = bottom_blob_bordered.channel(q);
@@ -465,7 +463,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
     }
     bottom_blob_bordered = Mat();
     // END transform input
-    
+
     // BEGIN dot
     Mat top_blob_tm;
     {
@@ -487,7 +485,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         else // if (tiles >= 1)
             bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int r = 0; r < 64; r++)
         {
             Mat tm2 = bottom_blob_tm2.channel(r);
@@ -625,7 +623,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         // permute end
         top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator);
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             float* output0_tm = top_blob_tm.channel(p);
@@ -1333,7 +1331,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm / 8 * h_tm / 8;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             const Mat out0_tm = top_blob_tm.channel(p);
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index 3c5af335a9e6..95ff6f2c5071 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -191,24 +191,24 @@ int Convolution_x86::create_pipeline(const Option& opt)
     // pack8
     if (elempack == 8 && out_elempack == 8)
     {
-        if (opt.use_fp16_storage &&kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        if (opt.use_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
         {
             conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(weight_data, weight_data_pack8, num_input, num_output);
         }
         else if (opt.use_fp16_storage && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
         {
             conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(weight_data, weight_data_pack8, num_input, num_output);
-
         }
         else if (opt.use_fp16_storage && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
         {
             conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(weight_data, weight_data_pack8, num_input, num_output);
-
-        } else if (opt.use_fp16_storage && kernel_w == 2 && kernel_h == 2 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        }
+        else if (opt.use_fp16_storage && kernel_w == 2 && kernel_h == 2 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
         {
             conv2x2s1_weight_fp16_pack8_avx(weight_data, weight_data_pack8, num_input, num_output);
-
-        } else {
+        }
+        else
+        {
             if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
             {
                 conv3x3s1_winograd64_transform_kernel_pack8_avx(weight_data, weight_data_pack8, num_input, num_output);
@@ -604,12 +604,14 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
 #if __AVX__
     if (elempack == 8 && out_elempack == 8)
     {
-
         if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
         {
-            if (opt.use_fp16_storage) {
+            if (opt.use_fp16_storage)
+            {
                 conv1x1s1_sgemm_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
-            } else {
+            }
+            else
+            {
                 conv1x1s1_sgemm_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
             }
 
@@ -620,12 +622,13 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
         {
-            if (opt.use_fp16_storage) {
+            if (opt.use_fp16_storage)
+            {
                 conv1x1s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
-
-            } else {
+            }
+            else
+            {
                 conv1x1s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
-
             }
             if (activation)
             {
@@ -634,12 +637,13 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
         {
-            if (opt.use_fp16_storage) {
+            if (opt.use_fp16_storage)
+            {
                 conv1x1s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
-
-            } else {
+            }
+            else
+            {
                 conv1x1s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
-
             }
 
             if (activation)
@@ -649,9 +653,12 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
         {
-            if (opt.use_fp16_storage) {
+            if (opt.use_fp16_storage)
+            {
                 conv3x3s1_winograd64_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
-            } else {
+            }
+            else
+            {
                 conv3x3s1_winograd64_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
             }
 
@@ -662,9 +669,12 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else if (kernel_w == 2 && kernel_h == 2 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
         {
-            if (opt.use_fp16_storage) {
+            if (opt.use_fp16_storage)
+            {
                 conv2x2s1_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
-            } else {
+            }
+            else
+            {
                 conv2x2s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
             }
 
@@ -675,8 +685,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else
         {
-            // num_output
-            #pragma omp parallel for num_threads(opt.num_threads)
+// num_output
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output / out_elempack; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -764,8 +774,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else
         {
-            // num_output
-            #pragma omp parallel for num_threads(opt.num_threads)
+// num_output
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output / out_elempack; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -822,8 +832,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else
         {
-            // num_output
-            #pragma omp parallel for num_threads(opt.num_threads)
+// num_output
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -909,8 +919,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else
         {
-            // num_output
-            #pragma omp parallel for num_threads(opt.num_threads)
+// num_output
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -991,7 +1001,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     use_winograd3x3_int8 = false;
 
     if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1
-            && num_input >= 16 && num_output >= 16)
+        && num_input >= 16 && num_output >= 16)
     {
         // winograd is slow on small channel count
         use_winograd3x3_int8 = true;
@@ -1059,10 +1069,10 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
         if (use_winograd3x3_int8)
         {
             conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
-            //             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
+//             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
 
-            // requantize, reverse scale inplace
-            #pragma omp parallel for num_threads(opt.num_threads)
+// requantize, reverse scale inplace
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 Option opt_g = opt;
@@ -1109,10 +1119,10 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
         if (use_winograd3x3_int8)
         {
             conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
-            //             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
+//             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
 
-            // dequantize, reverse scale inplace
-            #pragma omp parallel for num_threads(opt.num_threads)
+// dequantize, reverse scale inplace
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 Option opt_g = opt;
@@ -1196,7 +1206,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob,
             if (inner_top_blob.empty())
                 return -100;
 
-            #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < bottom_blob.c; c++)
             {
                 float* outptr = inner_bottom_blob.channel(c);
@@ -1216,7 +1226,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob,
             opt_g.blob_allocator = inner_top_blob.allocator;
             convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g);
 
-            #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < num_output; c++)
             {
                 float* outptr = (float*)top_blob.channel(c) + x * outw + y;
diff --git a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
index f4eeabb923ba..b23ea4b41ffe 100644
--- a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
+++ b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
@@ -23,7 +23,7 @@ static void convdw3x3s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co
 
     const float* bias = _bias;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         Mat out = top_blob.channel(g);
@@ -234,14 +234,14 @@ static void convdw3x3s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co
 
     const float* bias = _bias;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         Mat out = top_blob.channel(g);
 
         __m256 _bias0 = bias ? _mm256_loadu_ps((const float*)bias + g * 8) : _mm256_set1_ps(0.f);
 
-        const unsigned short* k0 =(const unsigned short* )kernel.row(g);
+        const unsigned short* k0 = (const unsigned short*)kernel.row(g);
 
         float* outptr0 = out.row(0);
         float* outptr1 = out.row(1);
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
index d277aa5d1964..c9b624161895 100644
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -118,7 +118,6 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
         // pack8
         if (elempack == 8)
         {
-
             if (opt.use_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
             {
                 Mat weight_data_r2 = weight_data.reshape(maxk, group);
@@ -127,7 +126,7 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
                 ncnn::cast_float32_to_float16(weight_data_tmp, weight_data_pack8, opt);
                 return 0;
             }
-            if (opt.use_fp16_storage &&  kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+            if (opt.use_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
             {
                 Mat weight_data_r2 = weight_data.reshape(maxk, group);
                 Mat weight_data_tmp;
@@ -289,9 +288,12 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
         {
             if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
             {
-                if (opt.use_fp16_storage) {
+                if (opt.use_fp16_storage)
+                {
                     convdw3x3s1_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
-                } else {
+                }
+                else
+                {
                     convdw3x3s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
                 }
 
@@ -304,9 +306,12 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
             }
             if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
             {
-                if (opt.use_fp16_storage) {
+                if (opt.use_fp16_storage)
+                {
                     convdw3x3s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
-                } else {
+                }
+                else
+                {
                     convdw3x3s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_pack8, bias_data, opt);
                 }
 
@@ -362,7 +367,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
                     }
                 }
 
-                #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
                 for (int g = 0; g < channels; g++)
                 {
                     float* outptr = top_blob.channel(g);
@@ -499,8 +504,8 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_
 
         const int channels_g = channels / group;
 
-        // quantize, scale and round to nearest
-        #pragma omp parallel for num_threads(opt.num_threads)
+// quantize, scale and round to nearest
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int g = 0; g < group; g++)
         {
             Option opt_g = opt;
@@ -614,7 +619,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_
     const int channels_g = channels / group;
     const int num_output_g = num_output / group;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp
old mode 100755
new mode 100644
index 416365eb1046..0135fb26f157
--- a/src/layer/x86/innerproduct_x86.cpp
+++ b/src/layer/x86/innerproduct_x86.cpp
@@ -1,425 +1,438 @@
-// Tencent is pleased to support the open source community by making ncnn
-// available.
-//
-// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this
-// file except in compliance with the License. You may obtain a copy of the
-// License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-// License for the specific language governing permissions and limitations under
-// the License.
-#include <algorithm>
-
-#ifdef __AVX__
-#include "avx_activation.h"
-#include "avx_usability.h"
-#endif // NCNN_AVX2
-
-#include "innerproduct_x86.h"
-
-#include "layer_type.h"
-
-
-namespace ncnn {
-
-DEFINE_LAYER_CREATOR(InnerProduct_x86)
-
-
-InnerProduct_x86::InnerProduct_x86()
-{
-#if __AVX__
-    support_packing = true;
-#endif // __AVX__
-
-    flatten = 0;
-}
-
-int InnerProduct_x86::create_pipeline(const Option& opt)
-{
-#if __AVX__
-    if (opt.use_packing_layout)
-    {
-        flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
-
-        ncnn::ParamDict pd;
-
-        flatten->load_param(pd);
-
-        flatten->create_pipeline(opt);
-    }
-    if (opt.use_fp16_storage && weight_data.elemsize == 4u)
-    {
-        ncnn::cast_float32_to_float16(weight_data, weight_data_fp16, opt);
-    }
-#endif // __AVX__
-
-
-    return 0;
-}
-
-int InnerProduct_x86::destroy_pipeline(const Option& opt)
-{
-    if (flatten)
-    {
-        flatten->destroy_pipeline(opt);
-        delete flatten;
-        flatten = 0;
-    }
-
-    return 0;
-}
-
-int InnerProduct_x86::forward(const Mat &bottom_blob, Mat &top_blob,
-                              const Option &opt) const {
-    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
-    {
-        // TODO
-        return InnerProduct::forward(bottom_blob, top_blob, opt);
-    }
-
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-    size_t elemsize = bottom_blob.elemsize;
-    int elempack = bottom_blob.elempack;
-    int size = w * h;
-    // fprintf(stderr, "bottom_blob %d x %d x %d, elempack = %d \n", w,h,channels,elempack);
-#if __AVX__
-    if (elempack == 8)
-    {
-        // flatten
-        Mat bottom_blob_flattened = bottom_blob;
-        if (bottom_blob.dims != 1)
-        {
-            Option opt_flatten = opt;
-            opt_flatten.blob_allocator = opt.workspace_allocator;
-
-            flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
-        }
-
-        // pack1
-        {
-            bottom_blob_flattened.w *= bottom_blob_flattened.elempack;
-            bottom_blob_flattened.cstep = bottom_blob_flattened.w;
-            bottom_blob_flattened.elemsize = 4u;
-            bottom_blob_flattened.elempack = 1;
-        }
-        if ( opt.use_fp16_storage) {
-            return forward_fp16(bottom_blob_flattened, top_blob, opt);
-        } else {
-            return forward(bottom_blob_flattened, top_blob, opt);
-        }
-    }
-
-    if (size % 8 == 0 &&  opt.use_fp16_storage) {
-            return forward_fp16(bottom_blob, top_blob, opt);
-    }
-    top_blob.create(num_output, elemsize, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-    
-
-    const float *weight_data_ptr = weight_data;
-
-    int nn_num_output = num_output >> 3;
-    int remain_num_output_start = nn_num_output << 3;
-
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int pp = 0; pp < nn_num_output; pp++) {
-        int p = pp * 8;
-
-        float sums[8] = {0.0f};
-        if (bias_term) {
-            sums[0] = bias_data[p];
-            sums[1] = bias_data[p + 1];
-            sums[2] = bias_data[p + 2];
-            sums[3] = bias_data[p + 3];
-            sums[4] = bias_data[p + 4];
-            sums[5] = bias_data[p + 5];
-            sums[6] = bias_data[p + 6];
-            sums[7] = bias_data[p + 7];
-        }
-        __m256 _sum0 = _mm256_set1_ps(0.f);
-        __m256 _sum1 = _mm256_set1_ps(0.f);
-        __m256 _sum2 = _mm256_set1_ps(0.f);
-        __m256 _sum3 = _mm256_set1_ps(0.f);
-        __m256 _sum4 = _mm256_set1_ps(0.f);
-        __m256 _sum5 = _mm256_set1_ps(0.f);
-        __m256 _sum6 = _mm256_set1_ps(0.f);
-        __m256 _sum7 = _mm256_set1_ps(0.f);
-
-        const float *w0 = weight_data_ptr + size * channels * p;
-        const float *w1 = weight_data_ptr + size * channels * (p + 1);
-        const float *w2 = weight_data_ptr + size * channels * (p + 2);
-        const float *w3 = weight_data_ptr + size * channels * (p + 3);
-        const float *w4 = weight_data_ptr + size * channels * (p + 4);
-        const float *w5 = weight_data_ptr + size * channels * (p + 5);
-        const float *w6 = weight_data_ptr + size * channels * (p + 6);
-        const float *w7 = weight_data_ptr + size * channels * (p + 7);
-
-
-        // channels
-        for (int q = 0; q < channels; q++) {
-            const float *m = bottom_blob.channel(q);
-            int nn = size >> 3;
-            int remain = size & 7;
-
-            for (; nn > 0; nn--) {
-                __m256 _m = _mm256_loadu_ps(m);
-
-                __m256 _w0 = _mm256_loadu_ps(w0);
-                _sum0 = _mm256_fmadd_ps(_m, _w0, _sum0);
-
-                __m256 _w1 = _mm256_loadu_ps(w1);
-                _sum1 = _mm256_fmadd_ps(_m, _w1, _sum1);
-
-                __m256 _w2 = _mm256_loadu_ps(w2);
-                _sum2 = _mm256_fmadd_ps(_m, _w2, _sum2);
-
-                __m256 _w3 = _mm256_loadu_ps(w3);
-                _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3);
-
-                __m256 _w4 = _mm256_loadu_ps(w4);
-                _sum4 = _mm256_fmadd_ps(_m, _w4, _sum4);
-
-                __m256 _w5 = _mm256_loadu_ps(w5);
-                _sum5 = _mm256_fmadd_ps(_m, _w5, _sum5);
-
-                __m256 _w6 = _mm256_loadu_ps(w6);
-                _sum6 = _mm256_fmadd_ps(_m, _w6, _sum6);
-
-                __m256 _w7 = _mm256_loadu_ps(w7);
-                _sum7 = _mm256_fmadd_ps(_m, _w7, _sum7);
-
-                m += 8;
-                w0 += 8;
-                w1 += 8;
-                w2 += 8;
-                w3 += 8;
-                w4 += 8;
-                w5 += 8;
-                w6 += 8;
-                w7 += 8;
-            }
-
-
-            for (; remain > 0; remain--) {
-                sums[0] += *m * *w0;
-                sums[1] += *m * *w1;
-                sums[2] += *m * *w2;
-                sums[3] += *m * *w3;
-                sums[4] += *m * *w4;
-                sums[5] += *m * *w5;
-                sums[6] += *m * *w6;
-                sums[7] += *m * *w7;
-
-                m++;
-                w0++;
-                w1++;
-                w2++;
-                w3++;
-                w4++;
-                w5++;
-                w6++;
-                w7++;
-            }
-            __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5,
-                                          _sum6, _sum7);
-            __m256 _sums_f = _mm256_loadu_ps(&sums[0]);
-            _sums = activation_ps(_mm256_add_ps(_sums_f, _sums), activation_type,
-                                  activation_params);
-            _mm256_storeu_ps(&top_blob[p], _sums);
-        }
-    }
-
-// num_output
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int p = remain_num_output_start; p < num_output; p++) {
-        float sum = 0.f;
-
-        if (bias_term)
-            sum = bias_data[p];
-
-        const float *w = weight_data_ptr + size * channels * p;
-
-        __m256 _sum = _mm256_set1_ps(0.f);
-        // channels
-        for (int q = 0; q < channels; q++) {
-            const float *m = bottom_blob.channel(q);
-
-            int nn = size >> 3;
-            int remain = size & 7;
-            for (; nn > 0; nn--) {
-                __m256 _m = _mm256_loadu_ps(m);
-
-                __m256 _w = _mm256_loadu_ps(w);
-                _sum = _mm256_fmadd_ps(_m, _w, _sum);
-
-                m += 8;
-                w += 8;
-            }
-            for (; remain > 0; remain--) {
-                sum += *m * *w;
-                m++;
-                w++;
-            }
-        }
-
-        sum += _mm256_reduce_add_ps(_sum);
-        sum = activation_ss(sum, activation_type, activation_params);
-
-        top_blob[p] = sum;
-    }
-    return 0;
-#else
-    return InnerProduct::forward(bottom_blob, top_blob, opt);
-#endif // __AVX__
-}
-#if __AVX__
-
-int InnerProduct_x86::forward_fp16(const Mat &bottom_blob, Mat &top_blob,
-                                   const Option &opt) const {
-
-
-    int w = bottom_blob.w;
-    int h = bottom_blob.h;
-    int channels = bottom_blob.c;
-    size_t elemsize = bottom_blob.elemsize;
-    int elempack = bottom_blob.elempack;
-    int size = w * h;
-
-    top_blob.create(num_output, elemsize, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    const unsigned short *weight_data_ptr = (const unsigned short *)weight_data_fp16;
-    
-    int nn_num_output = num_output >> 3;
-    int remain_num_output_start = nn_num_output << 3;
-
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int pp = 0; pp < nn_num_output; pp++) {
-        int p = pp*8;
-        float sums[8] = {0.0f};
-        if (bias_term) {
-            sums[0] = bias_data[p];
-            sums[1] = bias_data[p + 1];
-            sums[2] = bias_data[p + 2];
-            sums[3] = bias_data[p + 3];
-            sums[4] = bias_data[p + 4];
-            sums[5] = bias_data[p + 5];
-            sums[6] = bias_data[p + 6];
-            sums[7] = bias_data[p + 7];
-        }
-        __m256 _sum0 = _mm256_set1_ps(0.f);
-        __m256 _sum1 = _mm256_set1_ps(0.f);
-        __m256 _sum2 = _mm256_set1_ps(0.f);
-        __m256 _sum3 = _mm256_set1_ps(0.f);
-        __m256 _sum4 = _mm256_set1_ps(0.f);
-        __m256 _sum5 = _mm256_set1_ps(0.f);
-        __m256 _sum6 = _mm256_set1_ps(0.f);
-        __m256 _sum7 = _mm256_set1_ps(0.f);
-
-        const unsigned short *w0 = weight_data_ptr + size * channels * p;
-        const unsigned short *w1 = weight_data_ptr + size * channels * (p + 1);
-        const unsigned short *w2 = weight_data_ptr + size * channels * (p + 2);
-        const unsigned short *w3 = weight_data_ptr + size * channels * (p + 3);
-        const unsigned short *w4 = weight_data_ptr + size * channels * (p + 4);
-        const unsigned short *w5 = weight_data_ptr + size * channels * (p + 5);
-        const unsigned short *w6 = weight_data_ptr + size * channels * (p + 6);
-        const unsigned short *w7 = weight_data_ptr + size * channels * (p + 7);
-
-
-        // channels
-        for (int q = 0; q < channels; q++) {
-            const float *m = bottom_blob.channel(q);
-            int nn = size >> 3;
-            for (; nn > 0; nn--) {
-                __m256 _m = _mm256_loadu_ps(m);
-
-                __m256 _w0 = loadfp16(w0);
-                _sum0 = _mm256_fmadd_ps(_m, _w0, _sum0);
-
-                __m256 _w1 = loadfp16(w1);
-                _sum1 = _mm256_fmadd_ps(_m, _w1, _sum1);
-
-                __m256 _w2 = loadfp16(w2);
-                _sum2 = _mm256_fmadd_ps(_m, _w2, _sum2);
-
-                __m256 _w3 = loadfp16(w3);
-                _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3);
-
-                __m256 _w4 = loadfp16(w4);
-                _sum4 = _mm256_fmadd_ps(_m, _w4, _sum4);
-
-                __m256 _w5 = loadfp16(w5);
-                _sum5 = _mm256_fmadd_ps(_m, _w5, _sum5);
-
-                __m256 _w6 = loadfp16(w6);
-                _sum6 = _mm256_fmadd_ps(_m, _w6, _sum6);
-
-                __m256 _w7 = loadfp16(w7);
-                _sum7 = _mm256_fmadd_ps(_m, _w7, _sum7);
-
-                m += 8;
-                w0 += 8;
-                w1 += 8;
-                w2 += 8;
-                w3 += 8;
-                w4 += 8;
-                w5 += 8;
-                w6 += 8;
-                w7 += 8;
-            }
-            __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5,
-                                          _sum6, _sum7);
-            __m256 _sums_f = _mm256_loadu_ps(&sums[0]);
-            _sums = activation_ps(_mm256_add_ps(_sums_f, _sums), activation_type,
-                                  activation_params);
-            _mm256_storeu_ps(&top_blob[p], _sums);
-        }
-    }
-
-// num_output
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int p = remain_num_output_start; p < num_output; p++) {
-        float sum = 0.f;
-
-        if (bias_term)
-            sum = bias_data[p];
-
-        const unsigned short *w = weight_data_ptr + size * channels * p;
-
-        __m256 _sum = _mm256_set1_ps(0.f);
-        // channels
-        for (int q = 0; q < channels; q++) {
-            const float *m = bottom_blob.channel(q);
-
-            int nn = size >> 3;
-            for (; nn > 0; nn--) {
-                __m256 _m = _mm256_loadu_ps(m);
-
-                __m256 _w = loadfp16(w);
-                _sum = _mm256_fmadd_ps(_m, _w, _sum);
-
-                m += 8;
-                w += 8;
-            }
-        }
-
-        sum += _mm256_reduce_add_ps(_sum);
-        sum = activation_ss(sum, activation_type, activation_params);
-
-        top_blob[p] = sum;
-    }
-    return 0;
-}
-#endif // __ARM_NEON
-
-} // namespace ncnn
+// Tencent is pleased to support the open source community by making ncnn
+// available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this
+// file except in compliance with the License. You may obtain a copy of the
+// License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations under
+// the License.
+#include <algorithm>
+
+#ifdef __AVX__
+#include "avx_activation.h"
+#include "avx_usability.h"
+#endif // __AVX__
+
+#include "innerproduct_x86.h"
+
+#include "layer_type.h"
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(InnerProduct_x86)
+
+InnerProduct_x86::InnerProduct_x86()
+{
+#if __AVX__
+    support_packing = true;
+#endif // __AVX__
+
+    flatten = 0;
+}
+
+int InnerProduct_x86::create_pipeline(const Option& opt)
+{
+#if __AVX__
+    if (opt.use_packing_layout)
+    {
+        flatten = ncnn::create_layer(ncnn::LayerType::Flatten);
+
+        ncnn::ParamDict pd;
+
+        flatten->load_param(pd);
+
+        flatten->create_pipeline(opt);
+    }
+    if (opt.use_fp16_storage && weight_data.elemsize == 4u)
+    {
+        ncnn::cast_float32_to_float16(weight_data, weight_data_fp16, opt);
+    }
+#endif // __AVX__
+
+    return 0;
+}
+
+int InnerProduct_x86::destroy_pipeline(const Option& opt)
+{
+    if (flatten)
+    {
+        flatten->destroy_pipeline(opt);
+        delete flatten;
+        flatten = 0;
+    }
+
+    return 0;
+}
+
+int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob,
+                              const Option& opt) const
+{
+    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
+    {
+        // TODO: no x86-specific int8 kernel here yet; defer to the generic InnerProduct::forward
+        return InnerProduct::forward(bottom_blob, top_blob, opt);
+    }
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+    int size = w * h;
+    // fprintf(stderr, "bottom_blob %d x %d x %d, elempack = %d \n", w,h,channels,elempack);
+#if __AVX__
+    if (elempack == 8)
+    {
+        // flatten
+        Mat bottom_blob_flattened = bottom_blob;
+        if (bottom_blob.dims != 1)
+        {
+            Option opt_flatten = opt;
+            opt_flatten.blob_allocator = opt.workspace_allocator;
+
+            flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
+        }
+
+        // pack1
+        {
+            bottom_blob_flattened.w *= bottom_blob_flattened.elempack;
+            bottom_blob_flattened.cstep = bottom_blob_flattened.w;
+            bottom_blob_flattened.elemsize = 4u;
+            bottom_blob_flattened.elempack = 1;
+        }
+        if (opt.use_fp16_storage)
+        {
+            return forward_fp16(bottom_blob_flattened, top_blob, opt);
+        }
+        else
+        {
+            return forward(bottom_blob_flattened, top_blob, opt);
+        }
+    }
+
+    if (size % 8 == 0 && opt.use_fp16_storage)
+    {
+        return forward_fp16(bottom_blob, top_blob, opt);
+    }
+    top_blob.create(num_output, elemsize, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    const float* weight_data_ptr = weight_data;
+
+    int nn_num_output = num_output >> 3;
+    int remain_num_output_start = nn_num_output << 3;
+
+#pragma omp parallel for num_threads(opt.num_threads)
+    for (int pp = 0; pp < nn_num_output; pp++)
+    {
+        int p = pp * 8;
+
+        float sums[8] = {0.0f};
+        if (bias_term)
+        {
+            sums[0] = bias_data[p];
+            sums[1] = bias_data[p + 1];
+            sums[2] = bias_data[p + 2];
+            sums[3] = bias_data[p + 3];
+            sums[4] = bias_data[p + 4];
+            sums[5] = bias_data[p + 5];
+            sums[6] = bias_data[p + 6];
+            sums[7] = bias_data[p + 7];
+        }
+        __m256 _sum0 = _mm256_set1_ps(0.f);
+        __m256 _sum1 = _mm256_set1_ps(0.f);
+        __m256 _sum2 = _mm256_set1_ps(0.f);
+        __m256 _sum3 = _mm256_set1_ps(0.f);
+        __m256 _sum4 = _mm256_set1_ps(0.f);
+        __m256 _sum5 = _mm256_set1_ps(0.f);
+        __m256 _sum6 = _mm256_set1_ps(0.f);
+        __m256 _sum7 = _mm256_set1_ps(0.f);
+
+        const float* w0 = weight_data_ptr + size * channels * p;
+        const float* w1 = weight_data_ptr + size * channels * (p + 1);
+        const float* w2 = weight_data_ptr + size * channels * (p + 2);
+        const float* w3 = weight_data_ptr + size * channels * (p + 3);
+        const float* w4 = weight_data_ptr + size * channels * (p + 4);
+        const float* w5 = weight_data_ptr + size * channels * (p + 5);
+        const float* w6 = weight_data_ptr + size * channels * (p + 6);
+        const float* w7 = weight_data_ptr + size * channels * (p + 7);
+
+        // channels
+        for (int q = 0; q < channels; q++)
+        {
+            const float* m = bottom_blob.channel(q);
+            int nn = size >> 3;
+            int remain = size & 7;
+
+            for (; nn > 0; nn--)
+            {
+                __m256 _m = _mm256_loadu_ps(m);
+
+                __m256 _w0 = _mm256_loadu_ps(w0);
+                _sum0 = _mm256_fmadd_ps(_m, _w0, _sum0);
+
+                __m256 _w1 = _mm256_loadu_ps(w1);
+                _sum1 = _mm256_fmadd_ps(_m, _w1, _sum1);
+
+                __m256 _w2 = _mm256_loadu_ps(w2);
+                _sum2 = _mm256_fmadd_ps(_m, _w2, _sum2);
+
+                __m256 _w3 = _mm256_loadu_ps(w3);
+                _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3);
+
+                __m256 _w4 = _mm256_loadu_ps(w4);
+                _sum4 = _mm256_fmadd_ps(_m, _w4, _sum4);
+
+                __m256 _w5 = _mm256_loadu_ps(w5);
+                _sum5 = _mm256_fmadd_ps(_m, _w5, _sum5);
+
+                __m256 _w6 = _mm256_loadu_ps(w6);
+                _sum6 = _mm256_fmadd_ps(_m, _w6, _sum6);
+
+                __m256 _w7 = _mm256_loadu_ps(w7);
+                _sum7 = _mm256_fmadd_ps(_m, _w7, _sum7);
+
+                m += 8;
+                w0 += 8;
+                w1 += 8;
+                w2 += 8;
+                w3 += 8;
+                w4 += 8;
+                w5 += 8;
+                w6 += 8;
+                w7 += 8;
+            }
+
+            for (; remain > 0; remain--)
+            {
+                sums[0] += *m * *w0;
+                sums[1] += *m * *w1;
+                sums[2] += *m * *w2;
+                sums[3] += *m * *w3;
+                sums[4] += *m * *w4;
+                sums[5] += *m * *w5;
+                sums[6] += *m * *w6;
+                sums[7] += *m * *w7;
+
+                m++;
+                w0++;
+                w1++;
+                w2++;
+                w3++;
+                w4++;
+                w5++;
+                w6++;
+                w7++;
+            }
+            __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5,
+                                          _sum6, _sum7);
+            __m256 _sums_f = _mm256_loadu_ps(&sums[0]);
+            _sums = activation_ps(_mm256_add_ps(_sums_f, _sums), activation_type,
+                                  activation_params);
+            _mm256_storeu_ps(&top_blob[p], _sums);
+        }
+    }
+
+    // num_output
+#pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = remain_num_output_start; p < num_output; p++)
+    {
+        float sum = 0.f;
+
+        if (bias_term)
+            sum = bias_data[p];
+
+        const float* w = weight_data_ptr + size * channels * p;
+
+        __m256 _sum = _mm256_set1_ps(0.f);
+        // channels
+        for (int q = 0; q < channels; q++)
+        {
+            const float* m = bottom_blob.channel(q);
+
+            int nn = size >> 3;
+            int remain = size & 7;
+            for (; nn > 0; nn--)
+            {
+                __m256 _m = _mm256_loadu_ps(m);
+
+                __m256 _w = _mm256_loadu_ps(w);
+                _sum = _mm256_fmadd_ps(_m, _w, _sum);
+
+                m += 8;
+                w += 8;
+            }
+            for (; remain > 0; remain--)
+            {
+                sum += *m * *w;
+                m++;
+                w++;
+            }
+        }
+
+        sum += _mm256_reduce_add_ps(_sum);
+        sum = activation_ss(sum, activation_type, activation_params);
+
+        top_blob[p] = sum;
+    }
+    return 0;
+#else
+    return InnerProduct::forward(bottom_blob, top_blob, opt);
+#endif // __AVX__
+}
+#if __AVX__
+// fp16-weight path that forward() dispatches to: weights come from weight_data_fp16 and are widened to fp32 on load; inputs are read as float and sums are kept in fp32.
+int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob,
+                                   const Option& opt) const
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack; // NOTE(review): not referenced again in this function
+    int size = w * h; // values read per channel; only whole groups of 8 are consumed below (no scalar tail loop here)
+
+    top_blob.create(num_output, elemsize, opt.blob_allocator); // num_output values, same elemsize as the input blob
+    if (top_blob.empty())
+        return -100; // output blob is empty after create()
+
+    const unsigned short* weight_data_ptr = (const unsigned short*)weight_data_fp16; // raw 16-bit weights, converted by loadfp16()
+
+    int nn_num_output = num_output >> 3;
+    int remain_num_output_start = nn_num_output << 3;
+
+#pragma omp parallel for num_threads(opt.num_threads)
+    for (int pp = 0; pp < nn_num_output; pp++)
+    {
+        int p = pp * 8;
+        float sums[8] = {0.0f}; // bias for outputs p..p+7; stays zero when bias_term is false
+        if (bias_term)
+        {
+            sums[0] = bias_data[p];
+            sums[1] = bias_data[p + 1];
+            sums[2] = bias_data[p + 2];
+            sums[3] = bias_data[p + 3];
+            sums[4] = bias_data[p + 4];
+            sums[5] = bias_data[p + 5];
+            sums[6] = bias_data[p + 6];
+            sums[7] = bias_data[p + 7];
+        }
+        __m256 _sum0 = _mm256_set1_ps(0.f); // _sumK: 8-lane fp32 accumulator for output p + K
+        __m256 _sum1 = _mm256_set1_ps(0.f);
+        __m256 _sum2 = _mm256_set1_ps(0.f);
+        __m256 _sum3 = _mm256_set1_ps(0.f);
+        __m256 _sum4 = _mm256_set1_ps(0.f);
+        __m256 _sum5 = _mm256_set1_ps(0.f);
+        __m256 _sum6 = _mm256_set1_ps(0.f);
+        __m256 _sum7 = _mm256_set1_ps(0.f);
+
+        const unsigned short* w0 = weight_data_ptr + size * channels * p; // wK: start of the weight row for output p + K (size * channels values per row)
+        const unsigned short* w1 = weight_data_ptr + size * channels * (p + 1);
+        const unsigned short* w2 = weight_data_ptr + size * channels * (p + 2);
+        const unsigned short* w3 = weight_data_ptr + size * channels * (p + 3);
+        const unsigned short* w4 = weight_data_ptr + size * channels * (p + 4);
+        const unsigned short* w5 = weight_data_ptr + size * channels * (p + 5);
+        const unsigned short* w6 = weight_data_ptr + size * channels * (p + 6);
+        const unsigned short* w7 = weight_data_ptr + size * channels * (p + 7);
+
+        // channels
+        for (int q = 0; q < channels; q++)
+        {
+            const float* m = bottom_blob.channel(q); // input values are read as fp32
+            int nn = size >> 3; // number of 8-float groups in this channel
+            for (; nn > 0; nn--)
+            {
+                __m256 _m = _mm256_loadu_ps(m); // 8 inputs shared by all 8 output rows
+
+                __m256 _w0 = loadfp16(w0);
+                _sum0 = _mm256_fmadd_ps(_m, _w0, _sum0);
+
+                __m256 _w1 = loadfp16(w1);
+                _sum1 = _mm256_fmadd_ps(_m, _w1, _sum1);
+
+                __m256 _w2 = loadfp16(w2);
+                _sum2 = _mm256_fmadd_ps(_m, _w2, _sum2);
+
+                __m256 _w3 = loadfp16(w3);
+                _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3);
+
+                __m256 _w4 = loadfp16(w4);
+                _sum4 = _mm256_fmadd_ps(_m, _w4, _sum4);
+
+                __m256 _w5 = loadfp16(w5);
+                _sum5 = _mm256_fmadd_ps(_m, _w5, _sum5);
+
+                __m256 _w6 = loadfp16(w6);
+                _sum6 = _mm256_fmadd_ps(_m, _w6, _sum6);
+
+                __m256 _w7 = loadfp16(w7);
+                _sum7 = _mm256_fmadd_ps(_m, _w7, _sum7);
+
+                m += 8;
+                w0 += 8; // weight pointers keep advancing across channels; they are not reset per q
+                w1 += 8;
+                w2 += 8;
+                w3 += 8;
+                w4 += 8;
+                w5 += 8;
+                w6 += 8;
+                w7 += 8;
+            }
+            __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5,
+                                          _sum6, _sum7); // presumably one lane per accumulator holding its lane total (helper not shown here) - TODO confirm
+            __m256 _sums_f = _mm256_loadu_ps(&sums[0]); // bias values
+            _sums = activation_ps(_mm256_add_ps(_sums_f, _sums), activation_type,
+                                  activation_params);
+            _mm256_storeu_ps(&top_blob[p], _sums); // NOTE(review): this reduce/activate/store runs on every channel pass; the store from the last pass is the one that remains
+        }
+    }
+
+    // num_output
+#pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = remain_num_output_start; p < num_output; p++)
+    {
+        float sum = 0.f;
+
+        if (bias_term)
+            sum = bias_data[p];
+
+        const unsigned short* w = weight_data_ptr + size * channels * p; // weight row for the single output p
+
+        __m256 _sum = _mm256_set1_ps(0.f);
+        // channels
+        for (int q = 0; q < channels; q++)
+        {
+            const float* m = bottom_blob.channel(q);
+
+            int nn = size >> 3; // whole groups of 8 only, as in the blocked loop above
+            for (; nn > 0; nn--)
+            {
+                __m256 _m = _mm256_loadu_ps(m);
+
+                __m256 _w = loadfp16(w);
+                _sum = _mm256_fmadd_ps(_m, _w, _sum);
+
+                m += 8;
+                w += 8;
+            }
+        }
+
+        sum += _mm256_reduce_add_ps(_sum); // presumably adds the 8 lanes into a scalar (helper defined elsewhere) - TODO confirm
+        sum = activation_ss(sum, activation_type, activation_params);
+
+        top_blob[p] = sum;
+    }
+    return 0;
+}
+#endif // __AVX__
+
+} // namespace ncnn
diff --git a/src/layer/x86/innerproduct_x86.h b/src/layer/x86/innerproduct_x86.h
index dbaf299fb729..a08dd1f820b1 100644
--- a/src/layer/x86/innerproduct_x86.h
+++ b/src/layer/x86/innerproduct_x86.h
@@ -22,20 +22,22 @@
 
 namespace ncnn {
 
-class InnerProduct_x86 : virtual public InnerProduct {
+class InnerProduct_x86 : virtual public InnerProduct
+{
 public:
     InnerProduct_x86();
 
-    virtual int create_pipeline(const Option &opt);
-    virtual int destroy_pipeline(const Option &opt);
+    virtual int create_pipeline(const Option& opt);
+    virtual int destroy_pipeline(const Option& opt);
+
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob,
+                        const Option& opt) const;
 
-    virtual int forward(const Mat &bottom_blob, Mat &top_blob,
-                        const Option &opt) const;
 protected:
     int forward_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 public:
-    ncnn::Layer *flatten;
+    ncnn::Layer* flatten;
 
     // fp16 weight data
     Mat weight_data_fp16;
diff --git a/tests/testutil.h b/tests/testutil.h
index 14c2083ae609..3703ecc3611f 100644
--- a/tests/testutil.h
+++ b/tests/testutil.h
@@ -770,7 +770,6 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec
     opts[2].use_bf16_storage = true;
     opts[2].use_shader_pack8 = true;
     opts[2].use_image_storage = true;
-    
 
     for (int i = 0; i < 3; i++)
     {

From 6223e511c89625cfc940a959590a04e4efe8a5a1 Mon Sep 17 00:00:00 2001
From: "Restyled.io" <commits@restyled.io>
Date: Wed, 24 Jun 2020 09:11:03 +0000
Subject: [PATCH 2/4] Restyled by astyle

---
 src/layer/x86/cast_x86.cpp                     |  8 ++++----
 src/layer/x86/convolution_1x1_pack8.h          | 14 +++++++-------
 src/layer/x86/convolution_1x1_pack8_fp16.h     | 14 +++++++-------
 src/layer/x86/convolution_2x2_pack8.h          |  2 +-
 src/layer/x86/convolution_2x2_pack8_fp16.h     |  2 +-
 src/layer/x86/convolution_3x3_pack8.h          | 13 +++++++------
 src/layer/x86/convolution_3x3_pack8_fp16.h     | 13 +++++++------
 src/layer/x86/convolution_x86.cpp              | 18 +++++++++---------
 .../x86/convolutiondepthwise_3x3_pack8_fp16.h  |  4 ++--
 src/layer/x86/convolutiondepthwise_x86.cpp     |  6 +++---
 src/layer/x86/innerproduct_x86.cpp             |  8 ++++----
 11 files changed, 52 insertions(+), 50 deletions(-)

diff --git a/src/layer/x86/cast_x86.cpp b/src/layer/x86/cast_x86.cpp
index 6c3ba6ad6103..efb9df78758f 100644
--- a/src/layer/x86/cast_x86.cpp
+++ b/src/layer/x86/cast_x86.cpp
@@ -134,7 +134,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
         for (int i = 0; i < remain; i++)
             mask.m256i_u32[i] = 0x80000000;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -166,7 +166,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
         for (int i = 0; i < remain; i++)
             mask.m256i_u32[i] = 0x80000000;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const unsigned short* ptr = bottom_blob.channel(q);
@@ -192,7 +192,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
     }
     if (type_from == 4 && type_to == 1)
     {
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const unsigned short* ptr = bottom_blob.channel(q);
@@ -217,7 +217,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
     }
     if (type_from == 1 && type_to == 4)
     {
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
diff --git a/src/layer/x86/convolution_1x1_pack8.h b/src/layer/x86/convolution_1x1_pack8.h
index e75a3c107e3d..d832d4b78d2e 100644
--- a/src/layer/x86/convolution_1x1_pack8.h
+++ b/src/layer/x86/convolution_1x1_pack8.h
@@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
     {
         int nn_size = size / 12;
         int remain_size_start = nn_size * 12;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = ii * 12;
@@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
             }
         }
         nn_size = (size - remain_size_start) >> 3;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 8;
@@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
         remain_size_start += nn_size << 3;
         nn_size = (size - remain_size_start) >> 2;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 4;
@@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
 
         remain_size_start += nn_size << 2;
         nn_size = (size - remain_size_start) >> 1;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 2;
@@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
         }
 
         remain_size_start += nn_size << 1;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = remain_size_start; i < size; i++)
         {
             const float* img0 = bottom_blob.channel(0);
@@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
             }
         }
     }
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -1014,7 +1014,7 @@ static void conv1x1s2_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat
     Mat bottom_blob_shrinked;
     bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < channels; p++)
     {
         const float* r0 = bottom_blob.channel(p);
diff --git a/src/layer/x86/convolution_1x1_pack8_fp16.h b/src/layer/x86/convolution_1x1_pack8_fp16.h
index 5642e4a210d2..eb86507ff2fc 100644
--- a/src/layer/x86/convolution_1x1_pack8_fp16.h
+++ b/src/layer/x86/convolution_1x1_pack8_fp16.h
@@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
     {
         int nn_size = size / 12;
         int remain_size_start = nn_size * 12;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = ii * 12;
@@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
             }
         }
         nn_size = (size - remain_size_start) >> 3;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 8;
@@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         remain_size_start += nn_size << 3;
         nn_size = (size - remain_size_start) >> 2;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 4;
@@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
 
         remain_size_start += nn_size << 2;
         nn_size = (size - remain_size_start) >> 1;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 2;
@@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         }
 
         remain_size_start += nn_size << 1;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = remain_size_start; i < size; i++)
         {
             const float* img0 = bottom_blob.channel(0);
@@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
             }
         }
     }
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -1014,7 +1014,7 @@ static void conv1x1s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons
     Mat bottom_blob_shrinked;
     bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < channels; p++)
     {
         const float* r0 = bottom_blob.channel(p);
diff --git a/src/layer/x86/convolution_2x2_pack8.h b/src/layer/x86/convolution_2x2_pack8.h
index 0e59af34ec3e..79e6716acf46 100644
--- a/src/layer/x86/convolution_2x2_pack8.h
+++ b/src/layer/x86/convolution_2x2_pack8.h
@@ -20,7 +20,7 @@ static void conv2x2s1_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat
     int outch = top_blob.c;
     const float* bias = _bias;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out0 = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_2x2_pack8_fp16.h b/src/layer/x86/convolution_2x2_pack8_fp16.h
index 168bc77da425..68bbcfe02de7 100644
--- a/src/layer/x86/convolution_2x2_pack8_fp16.h
+++ b/src/layer/x86/convolution_2x2_pack8_fp16.h
@@ -201,7 +201,7 @@ static void conv2x2s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons
     int outch = top_blob.c;
     const float* bias = _bias;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out0 = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_3x3_pack8.h b/src/layer/x86/convolution_3x3_pack8.h
index a4bc42bda87e..923779d11a0f 100644
--- a/src/layer/x86/convolution_3x3_pack8.h
+++ b/src/layer/x86/convolution_3x3_pack8.h
@@ -26,9 +26,10 @@ static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, M
         {1.0f / 90, -1.0f / 45, 2.0f / 45},
         {1.0f / 45, 1.0f / 90, 1.0f / 180},
         {1.0f / 45, -1.0f / 90, 1.0f / 180},
-        {0.0f, 0.0f, 1.0f}};
+        {0.0f, 0.0f, 1.0f}
+    };
 
-#pragma omp parallel for
+    #pragma omp parallel for
     for (int p = 0; p < outch; p++)
     {
         for (int q = 0; q < inch; q++)
@@ -298,7 +299,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < inch; q++)
         {
             const Mat img0 = bottom_blob_bordered.channel(q);
@@ -485,7 +486,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         else // if (tiles >= 1)
             bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int r = 0; r < 64; r++)
         {
             Mat tm2 = bottom_blob_tm2.channel(r);
@@ -624,7 +625,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
 
         top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             float* output0_tm = top_blob_tm.channel(p);
@@ -1332,7 +1333,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm / 8 * h_tm / 8;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             const Mat out0_tm = top_blob_tm.channel(p);
diff --git a/src/layer/x86/convolution_3x3_pack8_fp16.h b/src/layer/x86/convolution_3x3_pack8_fp16.h
index 0bd0f5047588..8d372d0c136b 100644
--- a/src/layer/x86/convolution_3x3_pack8_fp16.h
+++ b/src/layer/x86/convolution_3x3_pack8_fp16.h
@@ -26,9 +26,10 @@ static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kern
         {1.0f / 90, -1.0f / 45, 2.0f / 45},
         {1.0f / 45, 1.0f / 90, 1.0f / 180},
         {1.0f / 45, -1.0f / 90, 1.0f / 180},
-        {0.0f, 0.0f, 1.0f}};
+        {0.0f, 0.0f, 1.0f}
+    };
 
-#pragma omp parallel for
+    #pragma omp parallel for
     for (int p = 0; p < outch; p++)
     {
         for (int q = 0; q < inch; q++)
@@ -298,7 +299,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < inch; q++)
         {
             const Mat img0 = bottom_blob_bordered.channel(q);
@@ -485,7 +486,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         else // if (tiles >= 1)
             bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int r = 0; r < 64; r++)
         {
             Mat tm2 = bottom_blob_tm2.channel(r);
@@ -623,7 +624,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         // permute end
         top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             float* output0_tm = top_blob_tm.channel(p);
@@ -1331,7 +1332,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm / 8 * h_tm / 8;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             const Mat out0_tm = top_blob_tm.channel(p);
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index 95ff6f2c5071..923b54d54964 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -686,7 +686,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         else
         {
 // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output / out_elempack; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -775,7 +775,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         else
         {
 // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output / out_elempack; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -833,7 +833,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         else
         {
 // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -920,7 +920,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         else
         {
 // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -1001,7 +1001,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     use_winograd3x3_int8 = false;
 
     if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1
-        && num_input >= 16 && num_output >= 16)
+            && num_input >= 16 && num_output >= 16)
     {
         // winograd is slow on small channel count
         use_winograd3x3_int8 = true;
@@ -1072,7 +1072,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
 //             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
 
 // requantize, reverse scale inplace
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 Option opt_g = opt;
@@ -1122,7 +1122,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
 //             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
 
 // dequantize, reverse scale inplace
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 Option opt_g = opt;
@@ -1206,7 +1206,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob,
             if (inner_top_blob.empty())
                 return -100;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < bottom_blob.c; c++)
             {
                 float* outptr = inner_bottom_blob.channel(c);
@@ -1226,7 +1226,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob,
             opt_g.blob_allocator = inner_top_blob.allocator;
             convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < num_output; c++)
             {
                 float* outptr = (float*)top_blob.channel(c) + x * outw + y;
diff --git a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
index b23ea4b41ffe..41e181352b5f 100644
--- a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
+++ b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
@@ -23,7 +23,7 @@ static void convdw3x3s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co
 
     const float* bias = _bias;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         Mat out = top_blob.channel(g);
@@ -234,7 +234,7 @@ static void convdw3x3s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co
 
     const float* bias = _bias;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         Mat out = top_blob.channel(g);
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
index c9b624161895..ab9325d87cbf 100644
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -367,7 +367,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
                     }
                 }
 
-#pragma omp parallel for num_threads(opt.num_threads)
+                #pragma omp parallel for num_threads(opt.num_threads)
                 for (int g = 0; g < channels; g++)
                 {
                     float* outptr = top_blob.channel(g);
@@ -505,7 +505,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_
         const int channels_g = channels / group;
 
 // quantize, scale and round to nearest
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int g = 0; g < group; g++)
         {
             Option opt_g = opt;
@@ -619,7 +619,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_
     const int channels_g = channels / group;
     const int num_output_g = num_output / group;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp
index 0135fb26f157..80af85fa5889 100644
--- a/src/layer/x86/innerproduct_x86.cpp
+++ b/src/layer/x86/innerproduct_x86.cpp
@@ -131,7 +131,7 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob,
     int nn_num_output = num_output >> 3;
     int remain_num_output_start = nn_num_output << 3;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp = 0; pp < nn_num_output; pp++)
     {
         int p = pp * 8;
@@ -243,7 +243,7 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob,
     }
 
     // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = remain_num_output_start; p < num_output; p++)
     {
         float sum = 0.f;
@@ -310,7 +310,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob,
     int nn_num_output = num_output >> 3;
     int remain_num_output_start = nn_num_output << 3;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp = 0; pp < nn_num_output; pp++)
     {
         int p = pp * 8;
@@ -397,7 +397,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob,
     }
 
     // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = remain_num_output_start; p < num_output; p++)
     {
         float sum = 0.f;

From aec7dcdb88fecc9194ac75377741428a29ef2c44 Mon Sep 17 00:00:00 2001
From: "Restyled.io" <commits@restyled.io>
Date: Wed, 24 Jun 2020 09:11:04 +0000
Subject: [PATCH 3/4] Restyled by clang-format

---
 src/layer/x86/cast_x86.cpp                    |  8 ++---
 src/layer/x86/convolution_1x1_pack8.h         | 14 ++++----
 src/layer/x86/convolution_1x1_pack8_fp16.h    | 14 ++++----
 src/layer/x86/convolution_2x2_pack8.h         |  2 +-
 src/layer/x86/convolution_2x2_pack8_fp16.h    |  2 +-
 src/layer/x86/convolution_3x3_pack8.h         | 13 ++++---
 src/layer/x86/convolution_3x3_pack8_fp16.h    | 13 ++++---
 src/layer/x86/convolution_x86.cpp             | 34 +++++++++----------
 .../x86/convolutiondepthwise_3x3_pack8_fp16.h |  4 +--
 src/layer/x86/convolutiondepthwise_x86.cpp    |  8 ++---
 src/layer/x86/innerproduct_x86.cpp            | 12 +++----
 11 files changed, 61 insertions(+), 63 deletions(-)

diff --git a/src/layer/x86/cast_x86.cpp b/src/layer/x86/cast_x86.cpp
index efb9df78758f..6c3ba6ad6103 100644
--- a/src/layer/x86/cast_x86.cpp
+++ b/src/layer/x86/cast_x86.cpp
@@ -134,7 +134,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
         for (int i = 0; i < remain; i++)
             mask.m256i_u32[i] = 0x80000000;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -166,7 +166,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
         for (int i = 0; i < remain; i++)
             mask.m256i_u32[i] = 0x80000000;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const unsigned short* ptr = bottom_blob.channel(q);
@@ -192,7 +192,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
     }
     if (type_from == 4 && type_to == 1)
     {
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const unsigned short* ptr = bottom_blob.channel(q);
@@ -217,7 +217,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
     }
     if (type_from == 1 && type_to == 4)
     {
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
diff --git a/src/layer/x86/convolution_1x1_pack8.h b/src/layer/x86/convolution_1x1_pack8.h
index d832d4b78d2e..e75a3c107e3d 100644
--- a/src/layer/x86/convolution_1x1_pack8.h
+++ b/src/layer/x86/convolution_1x1_pack8.h
@@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
     {
         int nn_size = size / 12;
         int remain_size_start = nn_size * 12;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = ii * 12;
@@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
             }
         }
         nn_size = (size - remain_size_start) >> 3;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 8;
@@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
         remain_size_start += nn_size << 3;
         nn_size = (size - remain_size_start) >> 2;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 4;
@@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
 
         remain_size_start += nn_size << 2;
         nn_size = (size - remain_size_start) >> 1;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 2;
@@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
         }
 
         remain_size_start += nn_size << 1;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int i = remain_size_start; i < size; i++)
         {
             const float* img0 = bottom_blob.channel(0);
@@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
             }
         }
     }
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -1014,7 +1014,7 @@ static void conv1x1s2_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat
     Mat bottom_blob_shrinked;
     bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < channels; p++)
     {
         const float* r0 = bottom_blob.channel(p);
diff --git a/src/layer/x86/convolution_1x1_pack8_fp16.h b/src/layer/x86/convolution_1x1_pack8_fp16.h
index eb86507ff2fc..5642e4a210d2 100644
--- a/src/layer/x86/convolution_1x1_pack8_fp16.h
+++ b/src/layer/x86/convolution_1x1_pack8_fp16.h
@@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
     {
         int nn_size = size / 12;
         int remain_size_start = nn_size * 12;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = ii * 12;
@@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
             }
         }
         nn_size = (size - remain_size_start) >> 3;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 8;
@@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         remain_size_start += nn_size << 3;
         nn_size = (size - remain_size_start) >> 2;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 4;
@@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
 
         remain_size_start += nn_size << 2;
         nn_size = (size - remain_size_start) >> 1;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 2;
@@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         }
 
         remain_size_start += nn_size << 1;
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int i = remain_size_start; i < size; i++)
         {
             const float* img0 = bottom_blob.channel(0);
@@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
             }
         }
     }
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -1014,7 +1014,7 @@ static void conv1x1s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons
     Mat bottom_blob_shrinked;
     bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < channels; p++)
     {
         const float* r0 = bottom_blob.channel(p);
diff --git a/src/layer/x86/convolution_2x2_pack8.h b/src/layer/x86/convolution_2x2_pack8.h
index 79e6716acf46..0e59af34ec3e 100644
--- a/src/layer/x86/convolution_2x2_pack8.h
+++ b/src/layer/x86/convolution_2x2_pack8.h
@@ -20,7 +20,7 @@ static void conv2x2s1_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat
     int outch = top_blob.c;
     const float* bias = _bias;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out0 = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_2x2_pack8_fp16.h b/src/layer/x86/convolution_2x2_pack8_fp16.h
index 68bbcfe02de7..168bc77da425 100644
--- a/src/layer/x86/convolution_2x2_pack8_fp16.h
+++ b/src/layer/x86/convolution_2x2_pack8_fp16.h
@@ -201,7 +201,7 @@ static void conv2x2s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons
     int outch = top_blob.c;
     const float* bias = _bias;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out0 = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_3x3_pack8.h b/src/layer/x86/convolution_3x3_pack8.h
index 923779d11a0f..a4bc42bda87e 100644
--- a/src/layer/x86/convolution_3x3_pack8.h
+++ b/src/layer/x86/convolution_3x3_pack8.h
@@ -26,10 +26,9 @@ static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, M
         {1.0f / 90, -1.0f / 45, 2.0f / 45},
         {1.0f / 45, 1.0f / 90, 1.0f / 180},
         {1.0f / 45, -1.0f / 90, 1.0f / 180},
-        {0.0f, 0.0f, 1.0f}
-    };
+        {0.0f, 0.0f, 1.0f}};
 
-    #pragma omp parallel for
+#pragma omp parallel for
     for (int p = 0; p < outch; p++)
     {
         for (int q = 0; q < inch; q++)
@@ -299,7 +298,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < inch; q++)
         {
             const Mat img0 = bottom_blob_bordered.channel(q);
@@ -486,7 +485,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         else // if (tiles >= 1)
             bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int r = 0; r < 64; r++)
         {
             Mat tm2 = bottom_blob_tm2.channel(r);
@@ -625,7 +624,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
 
         top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator);
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             float* output0_tm = top_blob_tm.channel(p);
@@ -1333,7 +1332,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm / 8 * h_tm / 8;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             const Mat out0_tm = top_blob_tm.channel(p);
diff --git a/src/layer/x86/convolution_3x3_pack8_fp16.h b/src/layer/x86/convolution_3x3_pack8_fp16.h
index 8d372d0c136b..0bd0f5047588 100644
--- a/src/layer/x86/convolution_3x3_pack8_fp16.h
+++ b/src/layer/x86/convolution_3x3_pack8_fp16.h
@@ -26,10 +26,9 @@ static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kern
         {1.0f / 90, -1.0f / 45, 2.0f / 45},
         {1.0f / 45, 1.0f / 90, 1.0f / 180},
         {1.0f / 45, -1.0f / 90, 1.0f / 180},
-        {0.0f, 0.0f, 1.0f}
-    };
+        {0.0f, 0.0f, 1.0f}};
 
-    #pragma omp parallel for
+#pragma omp parallel for
     for (int p = 0; p < outch; p++)
     {
         for (int q = 0; q < inch; q++)
@@ -299,7 +298,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < inch; q++)
         {
             const Mat img0 = bottom_blob_bordered.channel(q);
@@ -486,7 +485,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         else // if (tiles >= 1)
             bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int r = 0; r < 64; r++)
         {
             Mat tm2 = bottom_blob_tm2.channel(r);
@@ -624,7 +623,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         // permute end
         top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator);
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             float* output0_tm = top_blob_tm.channel(p);
@@ -1332,7 +1331,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm / 8 * h_tm / 8;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             const Mat out0_tm = top_blob_tm.channel(p);
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index 923b54d54964..c7c3d006a653 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -685,8 +685,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else
         {
-// num_output
-            #pragma omp parallel for num_threads(opt.num_threads)
+            // num_output
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output / out_elempack; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -774,8 +774,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else
         {
-// num_output
-            #pragma omp parallel for num_threads(opt.num_threads)
+            // num_output
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output / out_elempack; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -832,8 +832,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else
         {
-// num_output
-            #pragma omp parallel for num_threads(opt.num_threads)
+            // num_output
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -919,8 +919,8 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }
         else
         {
-// num_output
-            #pragma omp parallel for num_threads(opt.num_threads)
+            // num_output
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -1001,7 +1001,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     use_winograd3x3_int8 = false;
 
     if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1
-            && num_input >= 16 && num_output >= 16)
+        && num_input >= 16 && num_output >= 16)
     {
         // winograd is slow on small channel count
         use_winograd3x3_int8 = true;
@@ -1069,10 +1069,10 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
         if (use_winograd3x3_int8)
         {
             conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
-//             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
+            //             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
 
-// requantize, reverse scale inplace
-            #pragma omp parallel for num_threads(opt.num_threads)
+            // requantize, reverse scale inplace
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 Option opt_g = opt;
@@ -1119,10 +1119,10 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
         if (use_winograd3x3_int8)
         {
             conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
-//             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
+            //             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
 
-// dequantize, reverse scale inplace
-            #pragma omp parallel for num_threads(opt.num_threads)
+            // dequantize, reverse scale inplace
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 Option opt_g = opt;
@@ -1206,7 +1206,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob,
             if (inner_top_blob.empty())
                 return -100;
 
-            #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < bottom_blob.c; c++)
             {
                 float* outptr = inner_bottom_blob.channel(c);
@@ -1226,7 +1226,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob,
             opt_g.blob_allocator = inner_top_blob.allocator;
             convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g);
 
-            #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < num_output; c++)
             {
                 float* outptr = (float*)top_blob.channel(c) + x * outw + y;
diff --git a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
index 41e181352b5f..b23ea4b41ffe 100644
--- a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
+++ b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
@@ -23,7 +23,7 @@ static void convdw3x3s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co
 
     const float* bias = _bias;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         Mat out = top_blob.channel(g);
@@ -234,7 +234,7 @@ static void convdw3x3s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co
 
     const float* bias = _bias;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         Mat out = top_blob.channel(g);
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
index ab9325d87cbf..03ffb1011f66 100644
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -367,7 +367,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
                     }
                 }
 
-                #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
                 for (int g = 0; g < channels; g++)
                 {
                     float* outptr = top_blob.channel(g);
@@ -504,8 +504,8 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_
 
         const int channels_g = channels / group;
 
-// quantize, scale and round to nearest
-        #pragma omp parallel for num_threads(opt.num_threads)
+        // quantize, scale and round to nearest
+#pragma omp parallel for num_threads(opt.num_threads)
         for (int g = 0; g < group; g++)
         {
             Option opt_g = opt;
@@ -619,7 +619,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_
     const int channels_g = channels / group;
     const int num_output_g = num_output / group;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp
index 80af85fa5889..1d7633a549b8 100644
--- a/src/layer/x86/innerproduct_x86.cpp
+++ b/src/layer/x86/innerproduct_x86.cpp
@@ -131,7 +131,7 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob,
     int nn_num_output = num_output >> 3;
     int remain_num_output_start = nn_num_output << 3;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int pp = 0; pp < nn_num_output; pp++)
     {
         int p = pp * 8;
@@ -242,8 +242,8 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob,
         }
     }
 
-    // num_output
-    #pragma omp parallel for num_threads(opt.num_threads)
+// num_output
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = remain_num_output_start; p < num_output; p++)
     {
         float sum = 0.f;
@@ -310,7 +310,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob,
     int nn_num_output = num_output >> 3;
     int remain_num_output_start = nn_num_output << 3;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int pp = 0; pp < nn_num_output; pp++)
     {
         int p = pp * 8;
@@ -396,8 +396,8 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob,
         }
     }
 
-    // num_output
-    #pragma omp parallel for num_threads(opt.num_threads)
+// num_output
+#pragma omp parallel for num_threads(opt.num_threads)
     for (int p = remain_num_output_start; p < num_output; p++)
     {
         float sum = 0.f;

From 8831fe58254b29de1131e88c29826bb2615d0157 Mon Sep 17 00:00:00 2001
From: "Restyled.io" <commits@restyled.io>
Date: Wed, 24 Jun 2020 09:11:05 +0000
Subject: [PATCH 4/4] Restyled by astyle

---
 src/layer/x86/cast_x86.cpp                     |  8 ++++----
 src/layer/x86/convolution_1x1_pack8.h          | 14 +++++++-------
 src/layer/x86/convolution_1x1_pack8_fp16.h     | 14 +++++++-------
 src/layer/x86/convolution_2x2_pack8.h          |  2 +-
 src/layer/x86/convolution_2x2_pack8_fp16.h     |  2 +-
 src/layer/x86/convolution_3x3_pack8.h          | 13 +++++++------
 src/layer/x86/convolution_3x3_pack8_fp16.h     | 13 +++++++------
 src/layer/x86/convolution_x86.cpp              | 18 +++++++++---------
 .../x86/convolutiondepthwise_3x3_pack8_fp16.h  |  4 ++--
 src/layer/x86/convolutiondepthwise_x86.cpp     |  6 +++---
 src/layer/x86/innerproduct_x86.cpp             |  8 ++++----
 11 files changed, 52 insertions(+), 50 deletions(-)

diff --git a/src/layer/x86/cast_x86.cpp b/src/layer/x86/cast_x86.cpp
index 6c3ba6ad6103..efb9df78758f 100644
--- a/src/layer/x86/cast_x86.cpp
+++ b/src/layer/x86/cast_x86.cpp
@@ -134,7 +134,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
         for (int i = 0; i < remain; i++)
             mask.m256i_u32[i] = 0x80000000;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
@@ -166,7 +166,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
         for (int i = 0; i < remain; i++)
             mask.m256i_u32[i] = 0x80000000;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const unsigned short* ptr = bottom_blob.channel(q);
@@ -192,7 +192,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
     }
     if (type_from == 4 && type_to == 1)
     {
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const unsigned short* ptr = bottom_blob.channel(q);
@@ -217,7 +217,7 @@ int Cast_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
     }
     if (type_from == 1 && type_to == 4)
     {
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < channels; q++)
         {
             const float* ptr = bottom_blob.channel(q);
diff --git a/src/layer/x86/convolution_1x1_pack8.h b/src/layer/x86/convolution_1x1_pack8.h
index e75a3c107e3d..d832d4b78d2e 100644
--- a/src/layer/x86/convolution_1x1_pack8.h
+++ b/src/layer/x86/convolution_1x1_pack8.h
@@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
     {
         int nn_size = size / 12;
         int remain_size_start = nn_size * 12;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = ii * 12;
@@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
             }
         }
         nn_size = (size - remain_size_start) >> 3;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 8;
@@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
         remain_size_start += nn_size << 3;
         nn_size = (size - remain_size_start) >> 2;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 4;
@@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
 
         remain_size_start += nn_size << 2;
         nn_size = (size - remain_size_start) >> 1;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 2;
@@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
         }
 
         remain_size_start += nn_size << 1;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = remain_size_start; i < size; i++)
         {
             const float* img0 = bottom_blob.channel(0);
@@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, con
             }
         }
     }
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -1014,7 +1014,7 @@ static void conv1x1s2_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat
     Mat bottom_blob_shrinked;
     bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < channels; p++)
     {
         const float* r0 = bottom_blob.channel(p);
diff --git a/src/layer/x86/convolution_1x1_pack8_fp16.h b/src/layer/x86/convolution_1x1_pack8_fp16.h
index 5642e4a210d2..eb86507ff2fc 100644
--- a/src/layer/x86/convolution_1x1_pack8_fp16.h
+++ b/src/layer/x86/convolution_1x1_pack8_fp16.h
@@ -210,7 +210,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
     {
         int nn_size = size / 12;
         int remain_size_start = nn_size * 12;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = ii * 12;
@@ -251,7 +251,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
             }
         }
         nn_size = (size - remain_size_start) >> 3;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 8;
@@ -288,7 +288,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         remain_size_start += nn_size << 3;
         nn_size = (size - remain_size_start) >> 2;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 4;
@@ -315,7 +315,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
 
         remain_size_start += nn_size << 2;
         nn_size = (size - remain_size_start) >> 1;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int ii = 0; ii < nn_size; ii++)
         {
             int i = remain_size_start + ii * 2;
@@ -337,7 +337,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         }
 
         remain_size_start += nn_size << 1;
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int i = remain_size_start; i < size; i++)
         {
             const float* img0 = bottom_blob.channel(0);
@@ -353,7 +353,7 @@ static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob
             }
         }
     }
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out = top_blob.channel(p);
@@ -1014,7 +1014,7 @@ static void conv1x1s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons
     Mat bottom_blob_shrinked;
     bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < channels; p++)
     {
         const float* r0 = bottom_blob.channel(p);
diff --git a/src/layer/x86/convolution_2x2_pack8.h b/src/layer/x86/convolution_2x2_pack8.h
index 0e59af34ec3e..79e6716acf46 100644
--- a/src/layer/x86/convolution_2x2_pack8.h
+++ b/src/layer/x86/convolution_2x2_pack8.h
@@ -20,7 +20,7 @@ static void conv2x2s1_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat
     int outch = top_blob.c;
     const float* bias = _bias;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out0 = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_2x2_pack8_fp16.h b/src/layer/x86/convolution_2x2_pack8_fp16.h
index 168bc77da425..68bbcfe02de7 100644
--- a/src/layer/x86/convolution_2x2_pack8_fp16.h
+++ b/src/layer/x86/convolution_2x2_pack8_fp16.h
@@ -201,7 +201,7 @@ static void conv2x2s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, cons
     int outch = top_blob.c;
     const float* bias = _bias;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = 0; p < outch; p++)
     {
         Mat out0 = top_blob.channel(p);
diff --git a/src/layer/x86/convolution_3x3_pack8.h b/src/layer/x86/convolution_3x3_pack8.h
index a4bc42bda87e..923779d11a0f 100644
--- a/src/layer/x86/convolution_3x3_pack8.h
+++ b/src/layer/x86/convolution_3x3_pack8.h
@@ -26,9 +26,10 @@ static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, M
         {1.0f / 90, -1.0f / 45, 2.0f / 45},
         {1.0f / 45, 1.0f / 90, 1.0f / 180},
         {1.0f / 45, -1.0f / 90, 1.0f / 180},
-        {0.0f, 0.0f, 1.0f}};
+        {0.0f, 0.0f, 1.0f}
+    };
 
-#pragma omp parallel for
+    #pragma omp parallel for
     for (int p = 0; p < outch; p++)
     {
         for (int q = 0; q < inch; q++)
@@ -298,7 +299,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < inch; q++)
         {
             const Mat img0 = bottom_blob_bordered.channel(q);
@@ -485,7 +486,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         else // if (tiles >= 1)
             bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int r = 0; r < 64; r++)
         {
             Mat tm2 = bottom_blob_tm2.channel(r);
@@ -624,7 +625,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
 
         top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             float* output0_tm = top_blob_tm.channel(p);
@@ -1332,7 +1333,7 @@ static void conv3x3s1_winograd64_pack8_avx(const Mat& bottom_blob, Mat& top_blob
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm / 8 * h_tm / 8;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             const Mat out0_tm = top_blob_tm.channel(p);
diff --git a/src/layer/x86/convolution_3x3_pack8_fp16.h b/src/layer/x86/convolution_3x3_pack8_fp16.h
index 0bd0f5047588..8d372d0c136b 100644
--- a/src/layer/x86/convolution_3x3_pack8_fp16.h
+++ b/src/layer/x86/convolution_3x3_pack8_fp16.h
@@ -26,9 +26,10 @@ static void conv3x3s1_winograd64_transform_kernel_fp16_pack8_avx(const Mat& kern
         {1.0f / 90, -1.0f / 45, 2.0f / 45},
         {1.0f / 45, 1.0f / 90, 1.0f / 180},
         {1.0f / 45, -1.0f / 90, 1.0f / 180},
-        {0.0f, 0.0f, 1.0f}};
+        {0.0f, 0.0f, 1.0f}
+    };
 
-#pragma omp parallel for
+    #pragma omp parallel for
     for (int p = 0; p < outch; p++)
     {
         for (int q = 0; q < inch; q++)
@@ -298,7 +299,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q = 0; q < inch; q++)
         {
             const Mat img0 = bottom_blob_bordered.channel(q);
@@ -485,7 +486,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         else // if (tiles >= 1)
             bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int r = 0; r < 64; r++)
         {
             Mat tm2 = bottom_blob_tm2.channel(r);
@@ -623,7 +624,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         // permute end
         top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             float* output0_tm = top_blob_tm.channel(p);
@@ -1331,7 +1332,7 @@ static void conv3x3s1_winograd64_fp16_pack8_avx(const Mat& bottom_blob, Mat& top
         int h_tm = outh / 6 * 8;
         const int tiles = w_tm / 8 * h_tm / 8;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int p = 0; p < outch; p++)
         {
             const Mat out0_tm = top_blob_tm.channel(p);
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index c7c3d006a653..84d3b54719d7 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -686,7 +686,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         else
         {
             // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output / out_elempack; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -775,7 +775,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         else
         {
             // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output / out_elempack; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -833,7 +833,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         else
         {
             // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -920,7 +920,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         else
         {
             // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 float* outptr = top_blob.channel(p);
@@ -1001,7 +1001,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     use_winograd3x3_int8 = false;
 
     if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1
-        && num_input >= 16 && num_output >= 16)
+            && num_input >= 16 && num_output >= 16)
     {
         // winograd is slow on small channel count
         use_winograd3x3_int8 = true;
@@ -1072,7 +1072,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
             //             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data_int8, opt);
 
             // requantize, reverse scale inplace
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 Option opt_g = opt;
@@ -1122,7 +1122,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
             //             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data_int8, opt);
 
             // dequantize, reverse scale inplace
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int p = 0; p < num_output; p++)
             {
                 Option opt_g = opt;
@@ -1206,7 +1206,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob,
             if (inner_top_blob.empty())
                 return -100;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < bottom_blob.c; c++)
             {
                 float* outptr = inner_bottom_blob.channel(c);
@@ -1226,7 +1226,7 @@ int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob,
             opt_g.blob_allocator = inner_top_blob.allocator;
             convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g);
 
-#pragma omp parallel for num_threads(opt.num_threads)
+            #pragma omp parallel for num_threads(opt.num_threads)
             for (int c = 0; c < num_output; c++)
             {
                 float* outptr = (float*)top_blob.channel(c) + x * outw + y;
diff --git a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
index b23ea4b41ffe..41e181352b5f 100644
--- a/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
+++ b/src/layer/x86/convolutiondepthwise_3x3_pack8_fp16.h
@@ -23,7 +23,7 @@ static void convdw3x3s1_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co
 
     const float* bias = _bias;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         Mat out = top_blob.channel(g);
@@ -234,7 +234,7 @@ static void convdw3x3s2_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, co
 
     const float* bias = _bias;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         Mat out = top_blob.channel(g);
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
index 03ffb1011f66..fafca48e3e1a 100644
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -367,7 +367,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
                     }
                 }
 
-#pragma omp parallel for num_threads(opt.num_threads)
+                #pragma omp parallel for num_threads(opt.num_threads)
                 for (int g = 0; g < channels; g++)
                 {
                     float* outptr = top_blob.channel(g);
@@ -505,7 +505,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_
         const int channels_g = channels / group;
 
         // quantize, scale and round to nearest
-#pragma omp parallel for num_threads(opt.num_threads)
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int g = 0; g < group; g++)
         {
             Option opt_g = opt;
@@ -619,7 +619,7 @@ int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_
     const int channels_g = channels / group;
     const int num_output_g = num_output / group;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int g = 0; g < group; g++)
     {
         const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel_range(channels_g * g, channels_g);
diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp
index 1d7633a549b8..ab551332749c 100644
--- a/src/layer/x86/innerproduct_x86.cpp
+++ b/src/layer/x86/innerproduct_x86.cpp
@@ -131,7 +131,7 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob,
     int nn_num_output = num_output >> 3;
     int remain_num_output_start = nn_num_output << 3;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp = 0; pp < nn_num_output; pp++)
     {
         int p = pp * 8;
@@ -243,7 +243,7 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob,
     }
 
 // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = remain_num_output_start; p < num_output; p++)
     {
         float sum = 0.f;
@@ -310,7 +310,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob,
     int nn_num_output = num_output >> 3;
     int remain_num_output_start = nn_num_output << 3;
 
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int pp = 0; pp < nn_num_output; pp++)
     {
         int p = pp * 8;
@@ -397,7 +397,7 @@ int InnerProduct_x86::forward_fp16(const Mat& bottom_blob, Mat& top_blob,
     }
 
 // num_output
-#pragma omp parallel for num_threads(opt.num_threads)
+    #pragma omp parallel for num_threads(opt.num_threads)
     for (int p = remain_num_output_start; p < num_output; p++)
     {
         float sum = 0.f;