forked from MihaZupan/runtime-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JitDiff X64] MihaZupan/runtime/searchvalues-probAvx512Permute #660
Comments
Top method improvements
-58 (-26.01 % of base) - System.Buffers.ProbabilisticMap:ContainsMask64CharsAvx512(System.Runtime.Intrinsics.Vector512`1[ubyte],byref,byref):System.Runtime.Intrinsics.Vector512`1[ubyte]
; Assembly listing for method System.Buffers.ProbabilisticMap:ContainsMask64CharsAvx512(System.Runtime.Intrinsics.Vector512`1[ubyte],byref,byref):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
-; rbp based frame
+; rsp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
-; V01 arg0 [V01,T13] ( 2, 2 ) simd64 -> mm0 single-def <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+; V01 arg0 [V01,T10] ( 2, 2 ) simd64 -> mm0 single-def <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V02 arg1 [V02,T01] ( 3, 3 ) byref -> rsi single-def
; V03 arg2 [V03,T02] ( 3, 3 ) byref -> rdx single-def
; V04 loc0 [V04,T06] ( 3, 3 ) simd64 -> mm2 <System.Runtime.Intrinsics.Vector512`1[ushort]>
-; V05 loc1 [V05,T07] ( 3, 3 ) simd64 -> mm3 <System.Runtime.Intrinsics.Vector512`1[ubyte]>
+; V05 loc1 [V05,T07] ( 3, 3 ) simd64 -> mm4 <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V06 loc2 [V06,T08] ( 3, 3 ) simd64 -> mm1 <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V07 loc3 [V07 ] ( 0, 0 ) simd64 -> zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;# V08 OutArgs [V08 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V09 tmp1 [V09,T03] ( 3, 6 ) simd64 -> mm1 "impAppendStmt"
;* V10 tmp2 [V10 ] ( 0, 0 ) simd64 -> zero-ref "impAppendStmt"
;* V11 tmp3 [V11 ] ( 0, 0 ) simd64 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V12 tmp4 [V12 ] ( 0, 0 ) simd64 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V13 tmp5 [V13 ] ( 0, 0 ) simd64 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V14 tmp6 [V14 ] ( 0, 0 ) simd64 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V15 tmp7 [V15 ] ( 0, 0 ) simd64 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V16 tmp8 [V16 ] ( 0, 0 ) simd64 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-; V17 cse0 [V17,T09] ( 3, 3 ) simd64 -> mm3 "CSE #01: aggressive"
-; V18 cse1 [V18,T10] ( 3, 3 ) simd64 -> mm2 "CSE #02: aggressive"
-; V19 cse2 [V19,T11] ( 3, 3 ) simd64 -> mm5 "CSE #03: aggressive"
-; V20 cse3 [V20,T12] ( 3, 3 ) simd64 -> mm6 "CSE #04: aggressive"
-; V21 rat0 [V21,T04] ( 3, 6 ) simd64 -> mm3 "ReplaceWithLclVar is creating a new local variable"
-; V22 rat1 [V22,T05] ( 3, 6 ) simd64 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+; V15 cse0 [V15,T09] ( 3, 3 ) simd64 -> mm4 "CSE #01: aggressive"
+; V16 rat0 [V16,T04] ( 3, 6 ) simd64 -> mm2 "ReplaceWithLclVar is creating a new local variable"
+; V17 rat1 [V17,T05] ( 3, 6 ) simd64 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M31611_IG01:
- push rbp
- mov rbp, rsp
- vmovups zmm0, zmmword ptr [rbp+0x10]
- ;; size=14 bbWeight=1 PerfScore 4.25
+ vmovups zmm0, zmmword ptr [rsp+0x08]
+ ;; size=11 bbWeight=1 PerfScore 3.00
G_M31611_IG02:
vmovups zmm1, zmmword ptr [rsi]
vmovups zmm2, zmmword ptr [rdx]
vmovups zmm3, zmmword ptr [reloc @RWD00]
- vpandd zmm4, zmm3, zmm1
- vpandd zmm3, zmm3, zmm2
- vpackuswb zmm3, zmm4, zmm3
- vpsrlw zmm1, zmm1, 8
- vpsrlw zmm2, zmm2, 8
- vpackuswb zmm1, zmm1, zmm2
- vmovups zmm2, zmmword ptr [reloc @RWD64]
- vpandd zmm4, zmm2, zmm3
- vpermb zmm4, zmm4, zmm0
- vpsrld zmm3, zmm3, 5
- vmovups zmm5, zmmword ptr [reloc @RWD128]
- vpandd zmm3, zmm3, zmm5
- vmovups zmm6, zmmword ptr [reloc @RWD192]
- vpshufb zmm3, zmm6, zmm3
- vpandd zmm3, zmm3, zmm4
- vptestnmb k1, zmm3, zmm3
- vpmovm2b zmm3, k1
- vpandd zmm2, zmm2, zmm1
- vpermb zmm0, zmm2, zmm0
+ vmovaps zmm4, zmm1
+ vpermt2b zmm4, zmm3, zmm2
+ vmovups zmm3, zmmword ptr [reloc @RWD64]
+ vpermt2b zmm1, zmm3, zmm2
+ vpermb zmm2, zmm4, zmm0
+ vpsrld zmm3, zmm4, 5
+ vmovups zmm4, zmmword ptr [reloc @RWD128]
+ vpermb zmm3, zmm3, zmm4
+ vpandd zmm2, zmm3, zmm2
+ vptestnmb k1, zmm2, zmm2
+ vpmovm2b zmm2, k1
+ vpermb zmm0, zmm1, zmm0
vpsrld zmm1, zmm1, 5
- vpandd zmm1, zmm1, zmm5
- vpshufb zmm1, zmm6, zmm1
+ vpermb zmm1, zmm1, zmm4
vpandd zmm0, zmm1, zmm0
vptestnmb k1, zmm0, zmm0
vpmovm2b zmm0, k1
- vpternlogd zmm0, zmm0, zmm3, 17
+ vpternlogd zmm0, zmm0, zmm2, 17
vmovups zmmword ptr [rdi], zmm0
mov rax, rdi
- ;; size=204 bbWeight=1 PerfScore 47.42
+ ;; size=150 bbWeight=1 PerfScore 48.67
G_M31611_IG03:
vzeroupper
- pop rbp
ret
- ;; size=5 bbWeight=1 PerfScore 2.50
-RWD00 dq 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh
-RWD64 dq 1F1F1F1F1F1F1F1Fh, 1F1F1F1F1F1F1F1Fh, 1F1F1F1F1F1F1F1Fh, 1F1F1F1F1F1F1F1Fh, 1F1F1F1F1F1F1F1Fh, 1F1F1F1F1F1F1F1Fh, 1F1F1F1F1F1F1F1Fh, 1F1F1F1F1F1F1F1Fh
-RWD128 dq 0707070707070707h, 0707070707070707h, 0707070707070707h, 0707070707070707h, 0707070707070707h, 0707070707070707h, 0707070707070707h, 0707070707070707h
-RWD192 dq 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h
+ ;; size=4 bbWeight=1 PerfScore 2.00
+RWD00 dq 0E0C0A0806040200h, 1E1C1A1816141210h, 2E2C2A2826242220h, 3E3C3A3836343230h, 4E4C4A4846444240h, 5E5C5A5856545250h, 6E6C6A6866646260h, 7E7C7A7876747270h
+RWD64 dq 0F0D0B0907050301h, 1F1D1B1917151311h, 2F2D2B2927252321h, 3F3D3B3937353331h, 4F4D4B4947454341h, 5F5D5B5957555351h, 6F6D6B6967656361h, 7F7D7B7977757371h
+RWD128 dq 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h
-; Total bytes of code 223, prolog size 4, PerfScore 54.17, instruction count 37, allocated bytes for code 223 (MethodHash=c5138484) for method System.Buffers.ProbabilisticMap:ContainsMask64CharsAvx512(System.Runtime.Intrinsics.Vector512`1[ubyte],byref,byref):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
+; Total bytes of code 165, prolog size 0, PerfScore 53.67, instruction count 26, allocated bytes for code 165 (MethodHash=c5138484) for method System.Buffers.ProbabilisticMap:ContainsMask64CharsAvx512(System.Runtime.Intrinsics.Vector512`1[ubyte],byref,byref):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
; ============================================================
-36 (-22.22 % of base) - System.Buffers.ProbabilisticMap:ContainsMask32CharsAvx512(System.Runtime.Intrinsics.Vector256`1[ubyte],byref,byref):System.Runtime.Intrinsics.Vector256`1[ubyte]
; Assembly listing for method System.Buffers.ProbabilisticMap:ContainsMask32CharsAvx512(System.Runtime.Intrinsics.Vector256`1[ubyte],byref,byref):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
-; rbp based frame
+; rsp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
-; V01 arg0 [V01,T11] ( 2, 2 ) simd32 -> mm0 single-def <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+; V01 arg0 [V01,T08] ( 2, 2 ) simd32 -> mm0 single-def <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V02 arg1 [V02,T01] ( 3, 3 ) byref -> rsi single-def
; V03 arg2 [V03,T02] ( 3, 3 ) byref -> rdx single-def
; V04 loc0 [V04,T04] ( 3, 3 ) simd32 -> mm2 <System.Runtime.Intrinsics.Vector256`1[ushort]>
-; V05 loc1 [V05,T05] ( 3, 3 ) simd32 -> mm3 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
+; V05 loc1 [V05,T05] ( 3, 3 ) simd32 -> mm4 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V06 loc2 [V06,T06] ( 3, 3 ) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V07 loc3 [V07 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;# V08 OutArgs [V08 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V09 tmp1 [V09,T03] ( 3, 6 ) simd32 -> mm1 "impAppendStmt"
;* V10 tmp2 [V10 ] ( 0, 0 ) simd32 -> zero-ref "impAppendStmt"
;* V11 tmp3 [V11 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V12 tmp4 [V12 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V13 tmp5 [V13 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V14 tmp6 [V14 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V15 tmp7 [V15 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V16 tmp8 [V16 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-; V17 cse0 [V17,T07] ( 3, 3 ) simd32 -> mm3 "CSE #01: aggressive"
-; V18 cse1 [V18,T08] ( 3, 3 ) simd32 -> mm2 "CSE #02: aggressive"
-; V19 cse2 [V19,T09] ( 3, 3 ) simd32 -> mm5 "CSE #03: aggressive"
-; V20 cse3 [V20,T10] ( 3, 3 ) simd32 -> mm6 "CSE #04: aggressive"
+; V15 cse0 [V15,T07] ( 3, 3 ) simd32 -> mm4 "CSE #01: aggressive"
;
; Lcl frame size = 0
G_M54488_IG01:
- push rbp
- mov rbp, rsp
- vmovups ymm0, ymmword ptr [rbp+0x10]
- ;; size=9 bbWeight=1 PerfScore 5.25
+ vmovups ymm0, ymmword ptr [rsp+0x08]
+ ;; size=6 bbWeight=1 PerfScore 4.00
G_M54488_IG02:
vmovups ymm1, ymmword ptr [rsi]
vmovups ymm2, ymmword ptr [rdx]
vmovups ymm3, ymmword ptr [reloc @RWD00]
- vpand ymm4, ymm3, ymm1
- vpand ymm3, ymm3, ymm2
- vpackuswb ymm3, ymm4, ymm3
- vpsrlw ymm1, ymm1, 8
- vpsrlw ymm2, ymm2, 8
- vpackuswb ymm1, ymm1, ymm2
- vmovups ymm2, ymmword ptr [reloc @RWD32]
- vpand ymm4, ymm2, ymm3
- vpermb ymm4, ymm4, ymm0
- vpsrld ymm3, ymm3, 5
- vmovups ymm5, ymmword ptr [reloc @RWD64]
- vpand ymm3, ymm3, ymm5
- vmovups ymm6, ymmword ptr [reloc @RWD96]
- vpshufb ymm3, ymm6, ymm3
- vpand ymm3, ymm3, ymm4
- vxorps ymm4, ymm4, ymm4
- vpcmpeqb ymm3, ymm4, ymm3
- vpand ymm2, ymm2, ymm1
- vpermb ymm0, ymm2, ymm0
+ vmovaps ymm4, ymm1
+ vpermt2b ymm4, ymm3, ymm2
+ vmovups ymm3, ymmword ptr [reloc @RWD32]
+ vpermt2b ymm1, ymm3, ymm2
+ vpermb ymm2, ymm4, ymm0
+ vpsrld ymm3, ymm4, 5
+ vmovups ymm4, ymmword ptr [reloc @RWD64]
+ vpermb ymm3, ymm3, ymm4
+ vpand ymm2, ymm3, ymm2
+ vxorps ymm3, ymm3, ymm3
+ vpcmpeqb ymm2, ymm3, ymm2
+ vpermb ymm0, ymm1, ymm0
vpsrld ymm1, ymm1, 5
- vpand ymm1, ymm1, ymm5
- vpshufb ymm1, ymm6, ymm1
+ vpermb ymm1, ymm1, ymm4
vpand ymm0, ymm1, ymm0
- vpcmpeqb ymm0, ymm4, ymm0
- vpternlogd ymm0, ymm0, ymm3, 17
+ vpcmpeqb ymm0, ymm3, ymm0
+ vpternlogd ymm0, ymm0, ymm2, 17
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=148 bbWeight=1 PerfScore 52.75
+ ;; size=116 bbWeight=1 PerfScore 49.00
G_M54488_IG03:
vzeroupper
- pop rbp
ret
- ;; size=5 bbWeight=1 PerfScore 2.50
-RWD00 dq 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh, 00FF00FF00FF00FFh
-RWD32 dq 1F1F1F1F1F1F1F1Fh, 1F1F1F1F1F1F1F1Fh, 1F1F1F1F1F1F1F1Fh, 1F1F1F1F1F1F1F1Fh
-RWD64 dq 0707070707070707h, 0707070707070707h, 0707070707070707h, 0707070707070707h
-RWD96 dq 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h
+ ;; size=4 bbWeight=1 PerfScore 2.00
+RWD00 dq 0E0C0A0806040200h, 1E1C1A1816141210h, 2E2C2A2826242220h, 3E3C3A3836343230h
+RWD32 dq 0F0D0B0907050301h, 1F1D1B1917151311h, 2F2D2B2927252321h, 3F3D3B3937353331h
+RWD64 dq 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h
-; Total bytes of code 162, prolog size 4, PerfScore 60.50, instruction count 36, allocated bytes for code 162 (MethodHash=a8762b27) for method System.Buffers.ProbabilisticMap:ContainsMask32CharsAvx512(System.Runtime.Intrinsics.Vector256`1[ubyte],byref,byref):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
+; Total bytes of code 126, prolog size 0, PerfScore 55.00, instruction count 25, allocated bytes for code 126 (MethodHash=a8762b27) for method System.Buffers.ProbabilisticMap:ContainsMask32CharsAvx512(System.Runtime.Intrinsics.Vector256`1[ubyte],byref,byref):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
; ============================================================
-26 (-27.08 % of base) - System.Buffers.ProbabilisticMap:IsCharBitNotSetAvx512(System.Runtime.Intrinsics.Vector512`1[ubyte],System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte]
; Assembly listing for method System.Buffers.ProbabilisticMap:IsCharBitNotSetAvx512(System.Runtime.Intrinsics.Vector512`1[ubyte],System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T03] ( 1, 1 ) simd64 -> [rsp+0x08] single-def <System.Runtime.Intrinsics.Vector512`1[ubyte]>
; V02 arg1 [V02,T02] ( 2, 2 ) simd64 -> mm0 single-def <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V03 loc0 [V03 ] ( 0, 0 ) simd64 -> zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
;* V04 loc1 [V04 ] ( 0, 0 ) simd64 -> zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;* V05 loc2 [V05 ] ( 0, 0 ) simd64 -> zero-ref <System.Runtime.Intrinsics.Vector512`1[ubyte]>
-;# V06 OutArgs [V06 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V07 rat0 [V07,T01] ( 3, 6 ) simd64 -> mm0 "ReplaceWithLclVar is creating a new local variable"
+;# V05 OutArgs [V05 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V06 rat0 [V06,T01] ( 3, 6 ) simd64 -> mm0 "ReplaceWithLclVar is creating a new local variable"
;
; Lcl frame size = 0
G_M8383_IG01:
vmovups zmm0, zmmword ptr [rsp+0x48]
;; size=11 bbWeight=1 PerfScore 3.00
G_M8383_IG02:
- vpandd zmm1, zmm0, dword ptr [reloc @RWD00] {1to16}
- vpermb zmm1, zmm1, zmmword ptr [rsp+0x08]
+ vpermb zmm1, zmm0, zmmword ptr [rsp+0x08]
vpsrld zmm0, zmm0, 5
- vpandd zmm0, zmm0, dword ptr [reloc @RWD04] {1to16}
- vmovups zmm2, zmmword ptr [reloc @RWD64]
- vpshufb zmm0, zmm2, zmm0
+ vpermb zmm0, zmm0, zmmword ptr [reloc @RWD00]
vpandd zmm0, zmm0, zmm1
vptestnmb k1, zmm0, zmm0
vpmovm2b zmm0, k1
vmovups zmmword ptr [rdi], zmm0
mov rax, rdi
- ;; size=81 bbWeight=1 PerfScore 20.58
+ ;; size=55 bbWeight=1 PerfScore 16.58
G_M8383_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-RWD00 dd 1F1F1F1Fh
-RWD04 dd 07070707h
-RWD08 dd 00000000h, 00000000h, 00000000h, 00000000h, 00000000h, 00000000h
- dd 00000000h, 00000000h, 00000000h, 00000000h, 00000000h, 00000000h
- dd 00000000h, 00000000h
-RWD64 dq 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h
+RWD00 dq 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h
-; Total bytes of code 96, prolog size 0, PerfScore 25.58, instruction count 14, allocated bytes for code 96 (MethodHash=5c3fdf40) for method System.Buffers.ProbabilisticMap:IsCharBitNotSetAvx512(System.Runtime.Intrinsics.Vector512`1[ubyte],System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
+; Total bytes of code 70, prolog size 0, PerfScore 21.58, instruction count 11, allocated bytes for code 70 (MethodHash=5c3fdf40) for method System.Buffers.ProbabilisticMap:IsCharBitNotSetAvx512(System.Runtime.Intrinsics.Vector512`1[ubyte],System.Runtime.Intrinsics.Vector512`1[ubyte]):System.Runtime.Intrinsics.Vector512`1[ubyte] (FullOpts)
; ============================================================
-23 (-29.49 % of base) - System.Buffers.ProbabilisticMap:IsCharBitNotSetAvx512(System.Runtime.Intrinsics.Vector256`1[ubyte],System.Runtime.Intrinsics.Vector256`1[ubyte]):System.Runtime.Intrinsics.Vector256`1[ubyte]
; Assembly listing for method System.Buffers.ProbabilisticMap:IsCharBitNotSetAvx512(System.Runtime.Intrinsics.Vector256`1[ubyte],System.Runtime.Intrinsics.Vector256`1[ubyte]):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T02] ( 1, 1 ) simd32 -> [rsp+0x08] single-def <System.Runtime.Intrinsics.Vector256`1[ubyte]>
; V02 arg1 [V02,T01] ( 2, 2 ) simd32 -> mm0 single-def <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V03 loc0 [V03 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ubyte]>
;* V04 loc1 [V04 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;* V05 loc2 [V05 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ubyte]>
-;# V06 OutArgs [V06 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+;# V05 OutArgs [V05 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M16568_IG01:
vmovups ymm0, ymmword ptr [rsp+0x28]
;; size=6 bbWeight=1 PerfScore 4.00
G_M16568_IG02:
- vpandd ymm1, ymm0, dword ptr [reloc @RWD00] {1to8}
- vpermb ymm1, ymm1, ymmword ptr [rsp+0x08]
+ vpermb ymm1, ymm0, ymmword ptr [rsp+0x08]
vpsrld ymm0, ymm0, 5
- vpandd ymm0, ymm0, dword ptr [reloc @RWD04] {1to8}
- vmovups ymm2, ymmword ptr [reloc @RWD32]
- vpshufb ymm0, ymm2, ymm0
+ vpermb ymm0, ymm0, ymmword ptr [reloc @RWD00]
vpand ymm0, ymm0, ymm1
vxorps ymm1, ymm1, ymm1
vpcmpeqb ymm0, ymm1, ymm0
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=68 bbWeight=1 PerfScore 19.42
+ ;; size=45 bbWeight=1 PerfScore 14.42
G_M16568_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-RWD00 dd 1F1F1F1Fh
-RWD04 dd 07070707h
-RWD08 dd 00000000h, 00000000h, 00000000h, 00000000h, 00000000h, 00000000h
-RWD32 dq 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h
+RWD00 dq 8040201008040201h, 8040201008040201h, 8040201008040201h, 8040201008040201h
-; Total bytes of code 78, prolog size 0, PerfScore 25.42, instruction count 14, allocated bytes for code 78 (MethodHash=8d9abf47) for method System.Buffers.ProbabilisticMap:IsCharBitNotSetAvx512(System.Runtime.Intrinsics.Vector256`1[ubyte],System.Runtime.Intrinsics.Vector256`1[ubyte]):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
+; Total bytes of code 55, prolog size 0, PerfScore 20.42, instruction count 11, allocated bytes for code 55 (MethodHash=8d9abf47) for method System.Buffers.ProbabilisticMap:IsCharBitNotSetAvx512(System.Runtime.Intrinsics.Vector256`1[ubyte],System.Runtime.Intrinsics.Vector256`1[ubyte]):System.Runtime.Intrinsics.Vector256`1[ubyte] (FullOpts)
; ============================================================
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Job completed in 16 minutes 45 seconds.
Diffs
Artifacts:
The text was updated successfully, but these errors were encountered: