Skip to content

Commit

Permalink
Merge pull request #5 from Tencent/restyled/pull-1871
Browse files Browse the repository at this point in the history
Restyled/pull 1871
  • Loading branch information
Timen authored Jun 24, 2020
2 parents fbdde8c + 8831fe5 commit 067f8c5
Show file tree
Hide file tree
Showing 12 changed files with 653 additions and 629 deletions.
3 changes: 2 additions & 1 deletion src/layer/x86/avx_usability.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
#define AVX_USABILITY
#include <immintrin.h>

static inline __m256 loadfp16(const unsigned short* ptr) {
// Load 8 half-precision (fp16) values starting at ptr and widen them to
// 8 single-precision floats, returned packed in one __m256.
// The const qualifier on ptr is cast away only to match the intrinsic's
// __m128i* parameter; nothing is written through the pointer here.
// NOTE(review): _mm_load_si128 is the aligned 128-bit load — presumably every
// caller passes a 16-byte aligned ptr; confirm, or consider an unaligned load.
static inline __m256 loadfp16(const unsigned short* ptr)
{
return _mm256_cvtph_ps(_mm_load_si128((__m128i*)(ptr)));
}
static inline __m256 _mm256_fmadd_1_ps(__m256 a, __m256 b, float c)
Expand Down
160 changes: 79 additions & 81 deletions src/layer/x86/convolution_1x1_pack8.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

static void conv1x1s1_sgemm_transform_kernel_pack8_avx(const Mat& kernel, Mat& weight_data_pack8, int num_input, int num_output)
{

// src = kw-kh-inch-outch
// dst = 8b-8a-kw-kh-inch/8a-outch/8b
Mat weight_data_r2 = kernel.reshape(1, num_input, num_output);
Expand Down Expand Up @@ -109,90 +108,89 @@ static void conv1x1s1_sgemm_transform_kernel_pack8_avx(const Mat& kernel, Mat& w
const float* k77 = k7.row(p + 7);

float* g00 = g0.row(p / 8);
g00[0] = k00[0];
g00[1] = k10[0];
g00[2] = k20[0];
g00[3] = k30[0];
g00[4] = k40[0];
g00[5] = k50[0];
g00[6] = k60[0];
g00[7] = k70[0];
g00 += 8;
g00[0] = k01[0];
g00[1] = k11[0];
g00[2] = k21[0];
g00[3] = k31[0];
g00[4] = k41[0];
g00[5] = k51[0];
g00[6] = k61[0];
g00[7] = k71[0];

g00 += 8;
g00[0] = k02[0];
g00[1] = k12[0];
g00[2] = k22[0];
g00[3] = k32[0];
g00[4] = k42[0];
g00[5] = k52[0];
g00[6] = k62[0];
g00[7] = k72[0];

g00 += 8;
g00[0] = k03[0];
g00[1] = k13[0];
g00[2] = k23[0];
g00[3] = k33[0];
g00[4] = k43[0];
g00[5] = k53[0];
g00[6] = k63[0];
g00[7] = k73[0];

g00 += 8;
g00[0] = k04[0];
g00[1] = k14[0];
g00[2] = k24[0];
g00[3] = k34[0];
g00[4] = k44[0];
g00[5] = k54[0];
g00[6] = k64[0];
g00[7] = k74[0];

g00 += 8;
g00[0] = k05[0];
g00[1] = k15[0];
g00[2] = k25[0];
g00[3] = k35[0];
g00[4] = k45[0];
g00[5] = k55[0];
g00[6] = k65[0];
g00[7] = k75[0];

g00 += 8;
g00[0] = k06[0];
g00[1] = k16[0];
g00[2] = k26[0];
g00[3] = k36[0];
g00[4] = k46[0];
g00[5] = k56[0];
g00[6] = k66[0];
g00[7] = k76[0];

g00 += 8;
g00[0] = k07[0];
g00[1] = k17[0];
g00[2] = k27[0];
g00[3] = k37[0];
g00[4] = k47[0];
g00[5] = k57[0];
g00[6] = k67[0];
g00[7] = k77[0];

g00 += 8;
g00[0] = k00[0];
g00[1] = k10[0];
g00[2] = k20[0];
g00[3] = k30[0];
g00[4] = k40[0];
g00[5] = k50[0];
g00[6] = k60[0];
g00[7] = k70[0];
g00 += 8;
g00[0] = k01[0];
g00[1] = k11[0];
g00[2] = k21[0];
g00[3] = k31[0];
g00[4] = k41[0];
g00[5] = k51[0];
g00[6] = k61[0];
g00[7] = k71[0];

g00 += 8;
g00[0] = k02[0];
g00[1] = k12[0];
g00[2] = k22[0];
g00[3] = k32[0];
g00[4] = k42[0];
g00[5] = k52[0];
g00[6] = k62[0];
g00[7] = k72[0];

g00 += 8;
g00[0] = k03[0];
g00[1] = k13[0];
g00[2] = k23[0];
g00[3] = k33[0];
g00[4] = k43[0];
g00[5] = k53[0];
g00[6] = k63[0];
g00[7] = k73[0];

g00 += 8;
g00[0] = k04[0];
g00[1] = k14[0];
g00[2] = k24[0];
g00[3] = k34[0];
g00[4] = k44[0];
g00[5] = k54[0];
g00[6] = k64[0];
g00[7] = k74[0];

g00 += 8;
g00[0] = k05[0];
g00[1] = k15[0];
g00[2] = k25[0];
g00[3] = k35[0];
g00[4] = k45[0];
g00[5] = k55[0];
g00[6] = k65[0];
g00[7] = k75[0];

g00 += 8;
g00[0] = k06[0];
g00[1] = k16[0];
g00[2] = k26[0];
g00[3] = k36[0];
g00[4] = k46[0];
g00[5] = k56[0];
g00[6] = k66[0];
g00[7] = k76[0];

g00 += 8;
g00[0] = k07[0];
g00[1] = k17[0];
g00[2] = k27[0];
g00[3] = k37[0];
g00[4] = k47[0];
g00[5] = k57[0];
g00[6] = k67[0];
g00[7] = k77[0];

g00 += 8;
}
}
}


static void conv1x1s1_sgemm_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
Expand Down
164 changes: 81 additions & 83 deletions src/layer/x86/convolution_1x1_pack8_fp16.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
// specific language governing permissions and limitations under the License.
static void conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(const Mat& kernel, Mat& weight_data_pack8, int num_input, int num_output)
{
// src = kw-kh-inch-outch
// src = kw-kh-inch-outch
// dst = 8b-8a-kw-kh-inch/8a-outch/8b
Mat weight_data_r2 = kernel.reshape(1, num_input, num_output);

Expand Down Expand Up @@ -106,92 +106,90 @@ static void conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(const Mat& kernel, M
const float* k76 = k7.row(p + 6);
const float* k77 = k7.row(p + 7);

unsigned short* g00 =(unsigned short*) g0.row(p / 8);
g00[0] = float32_to_float16(k00[0]);
g00[1] = float32_to_float16(k10[0]);
g00[2] = float32_to_float16(k20[0]);
g00[3] = float32_to_float16(k30[0]);
g00[4] = float32_to_float16(k40[0]);
g00[5] = float32_to_float16(k50[0]);
g00[6] = float32_to_float16(k60[0]);
g00[7] = float32_to_float16(k70[0]);
g00 += 8;
g00[0] = float32_to_float16(k01[0]);
g00[1] = float32_to_float16(k11[0]);
g00[2] = float32_to_float16(k21[0]);
g00[3] = float32_to_float16(k31[0]);
g00[4] = float32_to_float16(k41[0]);
g00[5] = float32_to_float16(k51[0]);
g00[6] = float32_to_float16(k61[0]);
g00[7] = float32_to_float16(k71[0]);

g00 += 8;
g00[0] = float32_to_float16(k02[0]);
g00[1] = float32_to_float16(k12[0]);
g00[2] = float32_to_float16(k22[0]);
g00[3] = float32_to_float16(k32[0]);
g00[4] = float32_to_float16(k42[0]);
g00[5] = float32_to_float16(k52[0]);
g00[6] = float32_to_float16(k62[0]);
g00[7] = float32_to_float16(k72[0]);

g00 += 8;
g00[0] = float32_to_float16(k03[0]);
g00[1] = float32_to_float16(k13[0]);
g00[2] = float32_to_float16(k23[0]);
g00[3] = float32_to_float16(k33[0]);
g00[4] = float32_to_float16(k43[0]);
g00[5] = float32_to_float16(k53[0]);
g00[6] = float32_to_float16(k63[0]);
g00[7] = float32_to_float16(k73[0]);

g00 += 8;
g00[0] = float32_to_float16(k04[0]);
g00[1] = float32_to_float16(k14[0]);
g00[2] = float32_to_float16(k24[0]);
g00[3] = float32_to_float16(k34[0]);
g00[4] = float32_to_float16(k44[0]);
g00[5] = float32_to_float16(k54[0]);
g00[6] = float32_to_float16(k64[0]);
g00[7] = float32_to_float16(k74[0]);

g00 += 8;
g00[0] = float32_to_float16(k05[0]);
g00[1] = float32_to_float16(k15[0]);
g00[2] = float32_to_float16(k25[0]);
g00[3] = float32_to_float16(k35[0]);
g00[4] = float32_to_float16(k45[0]);
g00[5] = float32_to_float16(k55[0]);
g00[6] = float32_to_float16(k65[0]);
g00[7] = float32_to_float16(k75[0]);

g00 += 8;
g00[0] = float32_to_float16(k06[0]);
g00[1] = float32_to_float16(k16[0]);
g00[2] = float32_to_float16(k26[0]);
g00[3] = float32_to_float16(k36[0]);
g00[4] = float32_to_float16(k46[0]);
g00[5] = float32_to_float16(k56[0]);
g00[6] = float32_to_float16(k66[0]);
g00[7] = float32_to_float16(k76[0]);

g00 += 8;
g00[0] = float32_to_float16(k07[0]);
g00[1] = float32_to_float16(k17[0]);
g00[2] = float32_to_float16(k27[0]);
g00[3] = float32_to_float16(k37[0]);
g00[4] = float32_to_float16(k47[0]);
g00[5] = float32_to_float16(k57[0]);
g00[6] = float32_to_float16(k67[0]);
g00[7] = float32_to_float16(k77[0]);

g00 += 8;
unsigned short* g00 = (unsigned short*)g0.row(p / 8);
g00[0] = float32_to_float16(k00[0]);
g00[1] = float32_to_float16(k10[0]);
g00[2] = float32_to_float16(k20[0]);
g00[3] = float32_to_float16(k30[0]);
g00[4] = float32_to_float16(k40[0]);
g00[5] = float32_to_float16(k50[0]);
g00[6] = float32_to_float16(k60[0]);
g00[7] = float32_to_float16(k70[0]);
g00 += 8;
g00[0] = float32_to_float16(k01[0]);
g00[1] = float32_to_float16(k11[0]);
g00[2] = float32_to_float16(k21[0]);
g00[3] = float32_to_float16(k31[0]);
g00[4] = float32_to_float16(k41[0]);
g00[5] = float32_to_float16(k51[0]);
g00[6] = float32_to_float16(k61[0]);
g00[7] = float32_to_float16(k71[0]);

g00 += 8;
g00[0] = float32_to_float16(k02[0]);
g00[1] = float32_to_float16(k12[0]);
g00[2] = float32_to_float16(k22[0]);
g00[3] = float32_to_float16(k32[0]);
g00[4] = float32_to_float16(k42[0]);
g00[5] = float32_to_float16(k52[0]);
g00[6] = float32_to_float16(k62[0]);
g00[7] = float32_to_float16(k72[0]);

g00 += 8;
g00[0] = float32_to_float16(k03[0]);
g00[1] = float32_to_float16(k13[0]);
g00[2] = float32_to_float16(k23[0]);
g00[3] = float32_to_float16(k33[0]);
g00[4] = float32_to_float16(k43[0]);
g00[5] = float32_to_float16(k53[0]);
g00[6] = float32_to_float16(k63[0]);
g00[7] = float32_to_float16(k73[0]);

g00 += 8;
g00[0] = float32_to_float16(k04[0]);
g00[1] = float32_to_float16(k14[0]);
g00[2] = float32_to_float16(k24[0]);
g00[3] = float32_to_float16(k34[0]);
g00[4] = float32_to_float16(k44[0]);
g00[5] = float32_to_float16(k54[0]);
g00[6] = float32_to_float16(k64[0]);
g00[7] = float32_to_float16(k74[0]);

g00 += 8;
g00[0] = float32_to_float16(k05[0]);
g00[1] = float32_to_float16(k15[0]);
g00[2] = float32_to_float16(k25[0]);
g00[3] = float32_to_float16(k35[0]);
g00[4] = float32_to_float16(k45[0]);
g00[5] = float32_to_float16(k55[0]);
g00[6] = float32_to_float16(k65[0]);
g00[7] = float32_to_float16(k75[0]);

g00 += 8;
g00[0] = float32_to_float16(k06[0]);
g00[1] = float32_to_float16(k16[0]);
g00[2] = float32_to_float16(k26[0]);
g00[3] = float32_to_float16(k36[0]);
g00[4] = float32_to_float16(k46[0]);
g00[5] = float32_to_float16(k56[0]);
g00[6] = float32_to_float16(k66[0]);
g00[7] = float32_to_float16(k76[0]);

g00 += 8;
g00[0] = float32_to_float16(k07[0]);
g00[1] = float32_to_float16(k17[0]);
g00[2] = float32_to_float16(k27[0]);
g00[3] = float32_to_float16(k37[0]);
g00[4] = float32_to_float16(k47[0]);
g00[5] = float32_to_float16(k57[0]);
g00[6] = float32_to_float16(k67[0]);
g00[7] = float32_to_float16(k77[0]);

g00 += 8;
}
}
}



static void conv1x1s1_sgemm_fp16_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
int w = bottom_blob.w;
Expand Down
2 changes: 1 addition & 1 deletion src/layer/x86/convolution_2x2_pack8_fp16.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ static void conv2x2s1_weight_fp16_pack8_avx(const Mat& kernel, Mat& kernel_tm_pa
const float* k76 = k7.row(p + 6);
const float* k77 = k7.row(p + 7);

unsigned short* g00 =(unsigned short*) g0.row(p / 8);
unsigned short* g00 = (unsigned short*)g0.row(p / 8);

for (int k = 0; k < 4; k++)
{
Expand Down
1 change: 0 additions & 1 deletion src/layer/x86/convolution_3x3_pack8.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.


static void conv3x3s1_winograd64_transform_kernel_pack8_avx(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch)
{
// winograd63 transform kernel
Expand Down
Loading

0 comments on commit 067f8c5

Please sign in to comment.