-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
JIT ARM64-SVE: Add Sve.LoadVector*ZeroExtendTo*() #101291
Conversation
Add the following APIs: LoadVectorByteZeroExtendToInt16 LoadVectorByteZeroExtendToInt32 LoadVectorByteZeroExtendToInt64 LoadVectorByteZeroExtendToUInt16 LoadVectorByteZeroExtendToUInt32 LoadVectorByteZeroExtendToUInt64 LoadVectorInt16SignExtendToInt32 LoadVectorInt16SignExtendToInt64 LoadVectorInt16SignExtendToUInt32 LoadVectorInt16SignExtendToUInt64 LoadVectorInt32SignExtendToInt64 LoadVectorInt32SignExtendToUInt64 LoadVectorSByteSignExtendToInt16 LoadVectorSByteSignExtendToInt32 LoadVectorSByteSignExtendToInt64 LoadVectorSByteSignExtendToUInt16 LoadVectorSByteSignExtendToUInt32 LoadVectorSByteSignExtendToUInt64 LoadVectorUInt16ZeroExtendToInt32 LoadVectorUInt16ZeroExtendToInt64 LoadVectorUInt16ZeroExtendToUInt32 LoadVectorUInt16ZeroExtendToUInt64 LoadVectorUInt32ZeroExtendToInt64 LoadVectorUInt32ZeroExtendToUInt64
Note regarding the
|
Tagging subscribers to this area: @dotnet/area-system-runtime-intrinsics |
@dotnet-policy-service agree company="Arm" |
@kunalspathak @dotnet/arm64-contrib @a74nh |
Contributes to #99957. |
/// <summary> | ||
/// svint16_t svld1ub_s16(svbool_t pg, const uint8_t *base) | ||
/// LD1B Zresult.H, Pg/Z, [Xarray, Xindex] | ||
/// LD1B Zresult.H, Pg/Z, [Xbase, #0, MUL VL] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
could you please confirm which of the ld1*
instruction this API maps to? Is it https://docsmirror.github.io/A64/2023-06/ld1b_z_p_ai.html?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe it's https://docsmirror.github.io/A64/2023-06/ld1b_z_p_bi.html . @a74nh, could you confirm this JIC?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The disassembly gives ld1b { z16.h }, p7/z, [x0]
:
Beginning scenario: RunBasicScenario_Load
10:50:41.891 Passed test: _Sve_ro::JIT.HardwareIntrinsics.Arm._Sve.Program.SveLoadVector_ulong()
10:50:41.892 Running test: _Sve_ro::JIT.HardwareIntrinsics.Arm._Sve.Program.SveLoadVectorByteZeroExtendToInt16()
; Assembly listing for method JIT.HardwareIntrinsics.Arm._Sve.LoadUnaryOpTest__SveLoadVectorByteZeroExtendToInt16:RunBasicScenario_Load():this (Tier0)
; Emitting BLENDED_CODE for generic ARM64 - Unix
; Tier0 code
; fp based frame
; partially interruptible
; Final local variable assignments
;
; V00 this [V00 ] ( 1, 1 ) ref -> [fp+0x48] do-not-enreg[] this class-hnd <JIT.HardwareIntrinsics.Arm._Sve.LoadUnaryOpTest__SveLoadVectorByteZeroExtendToInt16>
; V01 loc0 [V01 ] ( 1, 1 ) simd16 -> [fp+0x30] HFA(simd16) do-not-enreg[S] must-init <System.Numerics.Vector`1[short]>
; V02 loc1 [V02 ] ( 1, 1 ) simd16 -> [fp+0x20] HFA(simd16) do-not-enreg[S] must-init <System.Numerics.Vector`1[short]>
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [sp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V04 tmp1 [V04 ] ( 1, 1 ) long -> [fp+0x18] do-not-enreg[] "non-inline candidate call"
; V05 tmp2 [V05 ] ( 1, 1 ) long -> [fp+0x10] do-not-enreg[] "argument with side effect"
;
; Lcl frame size = 64
G_M30813_IG01: ;; offset=0x0000
stp fp, lr, [sp, #-0x50]!
mov fp, sp
str xzr, [fp, #0x30] // [V01 loc0]
str xzr, [fp, #0x38] // [V01 loc0+0x08]
str xzr, [fp, #0x20] // [V02 loc1]
str xzr, [fp, #0x28] // [V02 loc1+0x08]
str x0, [fp, #0x48] // [V00 this]
;; size=28 bbWeight=1 PerfScore 6.50
G_M30813_IG02: ;; offset=0x001C
movz x0, #0xBE18
movk x0, #0x8423 LSL #16
movk x0, #0xFFFF LSL #32
movz x1, #0x7438 // code for TestLibrary.TestFramework:BeginScenario(System.String)
movk x1, #0x3AB9 LSL #16
movk x1, #0xFFFF LSL #32
ldr x1, [x1]
blr x1
ptrue p7.h
mov z16.h, p7/z, #1
str q16, [fp, #0x30] // [V01 loc0]
ldr x0, [fp, #0x48] // [V00 this]
ldrsb wzr, [x0]
ldr x0, [fp, #0x48] // [V00 this]
add x0, x0, #16
movz x1, #0x7DE0 // code for JIT.HardwareIntrinsics.Arm._Sve.LoadUnaryOpTest__SveLoadVectorByteZeroExtendToInt16+DataTable:get_inArray1Ptr():ulong:this
movk x1, #0x3AD1 LSL #16
movk x1, #0xFFFF LSL #32
ldr x1, [x1]
blr x1
ldr q16, [fp, #0x30] // [V01 loc0]
ptrue p7.h
cmpne p7.h, p7/z, z16.h, #0
ld1b { z16.h }, p7/z, [x0]
str q16, [fp, #0x20] // [V02 loc1]
ldr x0, [fp, #0x48] // [V00 this]
ldrsb wzr, [x0]
ldr x0, [fp, #0x48] // [V00 this]
add x0, x0, #16
movz x1, #0x7DF8 // code for JIT.HardwareIntrinsics.Arm._Sve.LoadUnaryOpTest__SveLoadVectorByteZeroExtendToInt16+DataTable:get_outArrayPtr():ulong:this
movk x1, #0x3AD1 LSL #16
movk x1, #0xFFFF LSL #32
ldr x1, [x1]
blr x1
ldr q16, [fp, #0x20] // [V02 loc1]
str q16, [x0]
ldr x0, [fp, #0x48] // [V00 this]
ldrsb wzr, [x0]
ldr x0, [fp, #0x48] // [V00 this]
add x0, x0, #16
movz x1, #0x7DE0 // code for JIT.HardwareIntrinsics.Arm._Sve.LoadUnaryOpTest__SveLoadVectorByteZeroExtendToInt16+DataTable:get_inArray1Ptr():ulong:this
movk x1, #0x3AD1 LSL #16
movk x1, #0xFFFF LSL #32
ldr x1, [x1]
blr x1
str x0, [fp, #0x18] // [V04 tmp1]
ldr x0, [fp, #0x48] // [V00 this]
ldrsb wzr, [x0]
ldr x0, [fp, #0x48] // [V00 this]
add x0, x0, #16
movz x1, #0x7DF8 // code for JIT.HardwareIntrinsics.Arm._Sve.LoadUnaryOpTest__SveLoadVectorByteZeroExtendToInt16+DataTable:get_outArrayPtr():ulong:this
movk x1, #0x3AD1 LSL #16
movk x1, #0xFFFF LSL #32
ldr x1, [x1]
blr x1
str x0, [fp, #0x10] // [V05 tmp2]
ldr x2, [fp, #0x10] // [V05 tmp2]
ldr x1, [fp, #0x18] // [V04 tmp1]
ldr x0, [fp, #0x48] // [V00 this]
movz x3, #0xBE18
movk x3, #0x8423 LSL #16
movk x3, #0xFFFF LSL #32
movz x4, #0x7EE8 // code for JIT.HardwareIntrinsics.Arm._Sve.LoadUnaryOpTest__SveLoadVectorByteZeroExtendToInt16:ValidateResult(ulong,ulong,System.String):this
movk x4, #0x3AD1 LSL #16
movk x4, #0xFFFF LSL #32
ldr x4, [x4]
blr x4
;; size=268 bbWeight=1 PerfScore 98.00
G_M30813_IG03: ;; offset=0x0128
ldp fp, lr, [sp], #0x50
ret lr
;; size=8 bbWeight=1 PerfScore 2.00
; Total bytes of code 304, prolog size 24, PerfScore 106.50, instruction count 76, allocated bytes for code 304 (MethodHash=c5fe87a2) for method JIT.HardwareIntrinsics.Arm._Sve.LoadUnaryOpTest__SveLoadVectorByteZeroExtendToInt16:RunBasicScenario_Load():this (Tier0)
; ============================================================
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The disassembly gives
ld1b { z16.h }, p7/z, [x0]
:
Just 1 register address. Due to the shortcut in emitInsSve_R_R_R()
, it drops into emitIns_R_R_R_I
using an immediate of 0. Which means.....
I believe it's https://docsmirror.github.io/A64/2023-06/ld1b_z_p_bi.html . @a74nh, could you confirm this JIC?
.....Yes, that one. Contiguous load unsigned bytes to vector (immediate index).
So, we only want the one in the API file:
/// LD1B Zresult.H, Pg/Z, [Xbase, #0, MUL VL]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, that entry from all the API documentation should go away.
("SveLoadMaskedUnOpTest.template", new Dictionary<string, string> {["TestName"] = "SveLoadVectorSByteSignExtendToUInt32", ["Isa"] = "Sve", ["Method"] = "LoadVectorSByteSignExtendToUInt32", ["RetVectorType"] = "Vector", ["RetBaseType"] = "UInt32", ["Op1VectorType"] = "Vector", ["Op1BaseType"] = "UInt32", ["Op2BaseType"] = "SByte", ["LargestVectorSize"] = "8", ["NextValueOp2"] = "TestLibrary.Generator.GetSByte()", ["ValidateIterResult"] = "firstOp[i] != result[i]"}), | ||
("SveLoadMaskedUnOpTest.template", new Dictionary<string, string> {["TestName"] = "SveLoadVectorSByteSignExtendToUInt64", ["Isa"] = "Sve", ["Method"] = "LoadVectorSByteSignExtendToUInt64", ["RetVectorType"] = "Vector", ["RetBaseType"] = "UInt64", ["Op1VectorType"] = "Vector", ["Op1BaseType"] = "UInt64", ["Op2BaseType"] = "SByte", ["LargestVectorSize"] = "8", ["NextValueOp2"] = "TestLibrary.Generator.GetSByte()", ["ValidateIterResult"] = "(ulong)firstOp[i] != result[i]"}), | ||
("SveLoadMaskedUnOpTest.template", new Dictionary<string, string> {["TestName"] = "SveLoadVectorUInt16ZeroExtendToInt32", ["Isa"] = "Sve", ["Method"] = "LoadVectorUInt16ZeroExtendToInt32", ["RetVectorType"] = "Vector", ["RetBaseType"] = "Int32", ["Op1VectorType"] = "Vector", ["Op1BaseType"] = "Int32", ["Op2BaseType"] = "UInt16", ["LargestVectorSize"] = "8", ["NextValueOp2"] = "TestLibrary.Generator.GetUInt16()", ["ValidateIterResult"] = "firstOp[i] != result[i]"}), | ||
("SveLoadMaskedUnOpTest.template", new Dictionary<string, string> {["TestName"] = "SveLoadVectorUInt16ZeroExtendToInt64", ["Isa"] = "Sve", ["Method"] = "LoadVectorUInt16ZeroExtendToInt64", ["RetVectorType"] = "Vector", ["RetBaseType"] = "Int64", ["Op1VectorType"] = "Vector", ["Op1BaseType"] = "Int64", ["Op2BaseType"] = "UInt16", ["LargestVectorSize"] = "8", ["NextValueOp2"] = "TestLibrary.Generator.GetUInt16()", ["ValidateIterResult"] = "firstOp[i] != result[i]"}), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the comment for LoadVector
as well - but it seems that the mask
vector gets populated with "TestLibrary.Generator.Get*()
which is mostly non-zero. We should also add a way to have in-active elements in the mask and make sure that they remain untouched.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should also add a way to have in-active elements in the mask and make sure that they remain untouched.
Agreed. the tricky part of creating the mask and knowing which elements are set. We could use the API methods, but most haven't been implemented yet.
Alternatively - add a helper which constructs a Vector filled randomly with 0 and 1 elements. The implicit mask to vector conversions mean that this can be used for the load mask. Then validateResult needs passing the mask so it can check the 0 entries in the mask match to 0s in the result.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should also add a way to have in-active elements in the mask and make sure that they remain untouched.
Agreed. the tricky part of creating the mask and knowing which elements are set. We could use the API methods, but most haven't been implemented yet.
This is how I created it and that is giving good test coverage - https://github.com/dotnet/runtime/pull/100743/files#diff-fd9c6bd33b62670bf0ba80ff74092bd4abfa7d8fb85b0b729a570289531e39d4R181
Alternatively - add a helper which constructs a Vector filled randomly with 0 and 1 elements. The implicit mask to vector conversions mean that this can be used for the load mask. Then validateResult needs passing the mask so it can check the 0 entries in the mask match to 0s in the result.
Yes, pretty much.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
created #101475
@JulieLeeMSFT I think only Kunal has the permissions to do this. |
Remove comments that mentions instuctions that APIs are never mapped to.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
could you please fix the summary docs and the formatting errors. I think it should be good after that.
Done, please check. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Thanks for your contribution!
I have updated this PR to resolve merge conflicts and mark the APIs as |
* JIT ARM64-SVE: Add Sve.LoadVector*ZeroExtendTo*() Add the following APIs: LoadVectorByteZeroExtendToInt16 LoadVectorByteZeroExtendToInt32 LoadVectorByteZeroExtendToInt64 LoadVectorByteZeroExtendToUInt16 LoadVectorByteZeroExtendToUInt32 LoadVectorByteZeroExtendToUInt64 LoadVectorInt16SignExtendToInt32 LoadVectorInt16SignExtendToInt64 LoadVectorInt16SignExtendToUInt32 LoadVectorInt16SignExtendToUInt64 LoadVectorInt32SignExtendToInt64 LoadVectorInt32SignExtendToUInt64 LoadVectorSByteSignExtendToInt16 LoadVectorSByteSignExtendToInt32 LoadVectorSByteSignExtendToInt64 LoadVectorSByteSignExtendToUInt16 LoadVectorSByteSignExtendToUInt32 LoadVectorSByteSignExtendToUInt64 LoadVectorUInt16ZeroExtendToInt32 LoadVectorUInt16ZeroExtendToInt64 LoadVectorUInt16ZeroExtendToUInt32 LoadVectorUInt16ZeroExtendToUInt64 LoadVectorUInt32ZeroExtendToInt64 LoadVectorUInt32ZeroExtendToUInt64 * cleanup: remove unwatnted comments Remove comments that mentions instuctions that APIs are never mapped to. * fix merge conflict * fix merge conflict * fix spacing * Mark LoadVector*Extend* as having HW_Flag_ExplicitMaskedOperation --------- Co-authored-by: Kunal Pathak <[email protected]>
* JIT ARM64-SVE: Add Sve.LoadVector*ZeroExtendTo*() Add the following APIs: LoadVectorByteZeroExtendToInt16 LoadVectorByteZeroExtendToInt32 LoadVectorByteZeroExtendToInt64 LoadVectorByteZeroExtendToUInt16 LoadVectorByteZeroExtendToUInt32 LoadVectorByteZeroExtendToUInt64 LoadVectorInt16SignExtendToInt32 LoadVectorInt16SignExtendToInt64 LoadVectorInt16SignExtendToUInt32 LoadVectorInt16SignExtendToUInt64 LoadVectorInt32SignExtendToInt64 LoadVectorInt32SignExtendToUInt64 LoadVectorSByteSignExtendToInt16 LoadVectorSByteSignExtendToInt32 LoadVectorSByteSignExtendToInt64 LoadVectorSByteSignExtendToUInt16 LoadVectorSByteSignExtendToUInt32 LoadVectorSByteSignExtendToUInt64 LoadVectorUInt16ZeroExtendToInt32 LoadVectorUInt16ZeroExtendToInt64 LoadVectorUInt16ZeroExtendToUInt32 LoadVectorUInt16ZeroExtendToUInt64 LoadVectorUInt32ZeroExtendToInt64 LoadVectorUInt32ZeroExtendToUInt64 * cleanup: remove unwatnted comments Remove comments that mentions instuctions that APIs are never mapped to. * fix merge conflict * fix merge conflict * fix spacing * Mark LoadVector*Extend* as having HW_Flag_ExplicitMaskedOperation --------- Co-authored-by: Kunal Pathak <[email protected]>
Add the following APIs:
LoadVectorByteZeroExtendToInt16
LoadVectorByteZeroExtendToInt32
LoadVectorByteZeroExtendToInt64
LoadVectorByteZeroExtendToUInt16
LoadVectorByteZeroExtendToUInt32
LoadVectorByteZeroExtendToUInt64
LoadVectorInt16SignExtendToInt32
LoadVectorInt16SignExtendToInt64
LoadVectorInt16SignExtendToUInt32
LoadVectorInt16SignExtendToUInt64
LoadVectorInt32SignExtendToInt64
LoadVectorInt32SignExtendToUInt64
LoadVectorSByteSignExtendToInt16
LoadVectorSByteSignExtendToInt32
LoadVectorSByteSignExtendToInt64
LoadVectorSByteSignExtendToUInt16
LoadVectorSByteSignExtendToUInt32
LoadVectorSByteSignExtendToUInt64
LoadVectorUInt16ZeroExtendToInt32
LoadVectorUInt16ZeroExtendToInt64
LoadVectorUInt16ZeroExtendToUInt32
LoadVectorUInt16ZeroExtendToUInt64
LoadVectorUInt32ZeroExtendToInt64
LoadVectorUInt32ZeroExtendToUInt64
Test results, existing APIs starting with
SveLoad
are removed from the output: