From 6af9bb937ce94ac72ef11cb0ad510614ede3fda6 Mon Sep 17 00:00:00 2001 From: Evan Nemerson Date: Sat, 16 Mar 2019 19:03:24 -0700 Subject: [PATCH] Add OpenMP SIMD pragmas to portable path. The omp simd pragma is supported by OpenMP 4.0+, but several compilers also offer the SIMD portion of OpenMP behind a separate flag since it doesn't require OpenMP support at runtime. If you want to use these pragmas without enabling full OpenMP support, make sure to define _ENABLE_OPENMP_SIMD at compile time. In my own extremely limited testing, this patch results in a roughly 25% performance increase (with GCC 8.3 using -O3 -march=native on a Xeon E3-1225 v3 running Fedora 29). It's still vastly slower than the hand-optimized AVX2 code path, but this should be portable. I haven't really spent much time optimizing this yet; there is probably a fair amount of room for improvement by adding safelen and aligned clauses where appropriate. --- src/facedetectcnn.cpp | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/src/facedetectcnn.cpp b/src/facedetectcnn.cpp index 1977c2f..d5b4400 100644 --- a/src/facedetectcnn.cpp +++ b/src/facedetectcnn.cpp @@ -49,6 +49,12 @@ the use of this software, even if advised of the possibility of such damage. 
#define SSE_256ELEMENT(vec, idx) vec[(idx)] #endif +#if !defined(_ENABLE_OPENMP_SIMD) && ((defined(_OPENMP) && (_OPENMP >= 201307L))) +# define _ENABLE_OPENMP_SIMD +#elif defined(__cilk) +# define _ENABLE_CILKPLUS +#endif + typedef struct NormalizedBBox_ { float xmin; @@ -138,6 +144,9 @@ inline float dotProductFloatChGeneral(float* p1, float * p2, int num, int length #else float sum = 0; +#if defined(_ENABLE_OPENMP_SIMD) +#pragma omp simd reduction(+:sum) +#endif for (int i = 0; i < num; i++) { sum += (p1[i] * p2[i]); @@ -220,6 +229,9 @@ inline int dotProductInt8ChGeneral(signed char * p1, signed char * p2, int num, int sum = 0; +#if defined(_ENABLE_OPENMP_SIMD) +#pragma omp simd reduction(+:sum) +#endif for (int i = 0; i < num; i++) { sum += ( int(p1[i]) * int(p2[i])); @@ -417,7 +429,6 @@ bool convertFloat2Int8(CDataBlob * dataBlob) #endif float scale = 1.f; - float tmp; if (dataBlob->int8_data_valid) return true; @@ -438,8 +449,12 @@ bool convertFloat2Int8(CDataBlob * dataBlob) } #else +#if defined(_ENABLE_OPENMP_SIMD) +#pragma omp simd reduction(max:maxval) +#endif for (int ch = 0; ch < dataBlob->channels; ch++) { + float tmp; //tmp = fabs(pF[ch]); //maxval = MAX(maxval, tmp); tmp = pF[ch]; @@ -475,7 +490,6 @@ bool convertFloat2Int8(CDataBlob * dataBlob) #endif for (int row = 0; row < dataBlob->height; row++) { - float tmp; for (int col = 0; col < dataBlob->width; col++) { float * pF = (dataBlob->data_float + (row*dataBlob->width + col)*dataBlob->floatChannelStepInByte / sizeof(float)); @@ -484,6 +498,7 @@ bool convertFloat2Int8(CDataBlob * dataBlob) #if defined(_ENABLE_NEON) for (int ch = 0; ch < dataBlob->channels; ch+=4) { + float tmp; float32x4_t a = vld1q_f32(pF + ch); float32x4_t resultvec = vmulq_f32(a, scalevec); @@ -504,8 +519,12 @@ bool convertFloat2Int8(CDataBlob * dataBlob) pI[ch+3] = (signed char)(tmp + ((tmp>0) - 0.5f)); } #else +#if defined(_ENABLE_OPENMP_SIMD) +#pragma omp simd +#endif for (int ch = 0; ch < dataBlob->channels; ch++) { + float 
tmp; //pI[ch] = (signed char)round(pF[ch] * scale); //to speedup round() using the following code tmp = pF[ch]; @@ -732,6 +751,9 @@ bool maxpooling2x2S2(const CDataBlob *inputData, CDataBlob *outputData) for (int ch = 0; ch < outputData->channels; ch++) { float maxval = pIn[ch + inputMatOffsetsInElement[0]]; +#if defined(_ENABLE_OPENMP_SIMD) +#pragma omp simd reduction(max:maxval) +#endif for (int el = 1; el < elementCount; el++) { maxval = MAX(maxval, pIn[ch + inputMatOffsetsInElement[el]]); @@ -832,6 +854,9 @@ bool scale(CDataBlob * dataBlob, float scale) } #else +#if defined(_ENABLE_OPENMP_SIMD) +#pragma omp simd +#endif for (int ch = 0; ch < dataBlob->channels; ch++) { pF[ch] *= scale; @@ -879,6 +904,9 @@ bool relu(const CDataBlob *inputOutputData) _mm256_store_ps(pData + ch, a); } #else +#if defined(_ENABLE_OPENMP_SIMD) +#pragma omp simd +#endif for (int ch = 0; ch < inputOutputData->channels; ch++) pData[ch] = MAX(pData[ch], 0); #endif @@ -1010,10 +1038,16 @@ bool normalize(CDataBlob * inputOutputData, float * pScale) } #else +#if defined(_ENABLE_OPENMP_SIMD) +#pragma omp simd reduction(+:sum) +#endif for (int ch = 0; ch < inputOutputData->channels; ch++) sum += (pData[ch] * pData[ch]); s = 1.0f/sqrt(sum); +#if defined(_ENABLE_OPENMP_SIMD) +#pragma omp simd +#endif for (int ch = 0; ch < inputOutputData->channels; ch++) pData[ch] = pData[ch] * pScale[ch] * s; #endif