Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport] 8262355: Support for AVX-512 opmask register allocation. #639

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2495,6 +2495,22 @@ void Assembler::kmovwl(KRegister dst, Address src) {
emit_operand((Register)dst, src);
}

// Store the low 16 bits of an opmask register to memory: KMOVW m16, k.
// Encoded as VEX/EVEX 0F 91 /r with the k-register in the reg field.
void Assembler::kmovwl(Address dst, KRegister src) {
assert(VM_Version::supports_evex(), "");
InstructionMark im(this);
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
vex_prefix(dst, 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
emit_int8((unsigned char)0x91);
// The k-register is encoded through the general-purpose reg field of the ModRM byte.
emit_operand((Register)src, dst);
}

// Copy one 16-bit opmask register to another: KMOVW k1, k2 (0F 90, reg-reg form).
void Assembler::kmovwl(KRegister dst, KRegister src) {
// NOTE(review): the word-size kmov is an AVX-512F instruction per the Intel SDM;
// the avx512bw check here looks stricter than required, and differs from the
// other kmovwl overloads in this file which assert supports_evex() -- confirm.
assert(VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
// 0xC0 | encode selects the register-direct ModRM form.
emit_int16((unsigned char)0x90, (0xC0 | encode));
}

void Assembler::kmovdl(KRegister dst, Register src) {
assert(VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
Expand Down Expand Up @@ -2815,6 +2831,22 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, bool mer
emit_operand(dst, src);
}

// Masked byte store: EVMOVDQU8 m, {k}, xmm/ymm/zmm. Only the byte lanes
// selected by 'mask' are written to memory. When 'merge' is true the EVEX
// zeroing (clear-context) bit is dropped so unselected destination lanes are
// left untouched (merge-masking semantics).
void Assembler::evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_avx512vlbw(), "");
assert(src != xnoreg, "sanity");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
// Embed the opmask register into the EVEX prefix (aaa field).
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
if (merge) {
attributes.reset_is_clear_context();
}
// F2-prefixed 0F 7F is the byte-granularity EVEX store form (vmovdqu8).
vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x7F);
emit_operand(src, dst);
}

void Assembler::evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionMark im(this);
Expand Down Expand Up @@ -9438,6 +9470,13 @@ void Assembler::evpblendmq (XMMRegister dst, KRegister mask, XMMRegister nds, XM
emit_int16(0x64, (0xC0 | encode));
}

// BZHI (64-bit): dst = src1 with all bits at positions >= src2[7:0] zeroed.
// BMI2 instruction, VEX.0F38 0xF5 with vex_w=1 for the quadword form; the
// bit-index operand (src2) travels in the VEX.vvvv field.
void Assembler::bzhiq(Register dst, Register src1, Register src2) {
JoshuaZhuwj marked this conversation as resolved.
Show resolved Hide resolved
assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
// Operand order: reg = dst, vvvv = src2 (index), rm = src1 (source bits).
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0xF5, (0xC0 | encode));
}

void Assembler::shlxl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
Expand Down
5 changes: 5 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1494,6 +1494,8 @@ class Assembler : public AbstractAssembler {
void kmovwl(KRegister dst, Register src);
void kmovwl(KRegister dst, Address src);
void kmovwl(Register dst, KRegister src);
void kmovwl(Address dst, KRegister src);
void kmovwl(KRegister dst, KRegister src);
void kmovdl(KRegister dst, Register src);
void kmovdl(Register dst, KRegister src);
void kmovql(KRegister dst, KRegister src);
Expand Down Expand Up @@ -1542,6 +1544,7 @@ class Assembler : public AbstractAssembler {
void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len);
void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len);
void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len);
void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len);
Expand Down Expand Up @@ -2110,6 +2113,8 @@ class Assembler : public AbstractAssembler {
void shlxl(Register dst, Register src1, Register src2);
void shlxq(Register dst, Register src1, Register src2);

void bzhiq(Register dst, Register src1, Register src2);
JoshuaZhuwj marked this conversation as resolved.
Show resolved Hide resolved

//====================VECTOR ARITHMETIC=====================================
void evpmovd2m(KRegister kdst, XMMRegister src, int vector_len);
void evpmovq2m(KRegister kdst, XMMRegister src, int vector_len);
Expand Down
26 changes: 21 additions & 5 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -987,6 +987,13 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg
reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// Materialize an opmask whose low 'len' bits are set:
//   temp = all-ones; bzhi zeroes every bit at index >= len; move into dst.
// 'temp' is a caller-provided GP scratch register and is clobbered.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
// assert(ArrayCopyPartialInlineSize <= 64,""); JDK-8261553 not introduced
mov64(temp, -1L);
bzhiq(temp, temp, len);
kmovql(dst, temp);
}
#endif // _LP64

void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
Expand Down Expand Up @@ -1033,6 +1040,15 @@ void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, X
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

// Masked vector load, C2 entry point: delegates to the MacroAssembler
// type-dispatching evmovdqu (memory -> register form).
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}

// Masked vector store, C2 entry point: delegates to the MacroAssembler
// type-dispatching evmovdqu (register -> memory form).
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}


void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
XMMRegister dst, XMMRegister src,
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
Expand Down Expand Up @@ -1234,7 +1250,8 @@ void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask
}
}

void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
switch(vlen) {
case 4:
assert(vtmp1 != xnoreg, "required.");
Expand Down Expand Up @@ -1272,14 +1289,13 @@ void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegist
break;
case 64:
{
KRegister ktemp = k2; // Use a hardcoded temp due to no k register allocation.
assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
evpcmpeqb(ktemp, src1, src2, Assembler::AVX_512bit);
evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
if (bt == BoolTest::ne) {
ktestql(ktemp, ktemp);
ktestql(mask, mask);
} else {
assert(bt == BoolTest::overflow, "required");
kortestql(ktemp, ktemp);
kortestql(mask, mask);
}
}
break;
Expand Down
6 changes: 5 additions & 1 deletion src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@
void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len);
void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len);

void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len);
void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len);

// extract
void extract(BasicType typ, Register dst, XMMRegister src, int idx);
XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex);
Expand All @@ -75,7 +78,7 @@

// vector test
void vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg);
XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg, KRegister mask = knoreg);

// blend
void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1);
Expand All @@ -90,6 +93,7 @@
void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void genmask(KRegister dst, Register len, Register temp);
#endif // _LP64

// dst = reduce(op, src2) using vtmp as temps
Expand Down
37 changes: 33 additions & 4 deletions src/hotspot/cpu/x86/gc/z/zBarrierSetAssembler_x86.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -150,7 +150,6 @@ void ZBarrierSetAssembler::load_at(MacroAssembler* masm,
// Call VM
call_vm(masm, ZBarrierSetRuntime::load_barrier_on_oop_field_preloaded_addr(decorators), dst, scratch);

// Restore registers
__ movdqu(xmm0, Address(rsp, xmm_size * 0));
__ movdqu(xmm1, Address(rsp, xmm_size * 1));
__ movdqu(xmm2, Address(rsp, xmm_size * 2));
Expand Down Expand Up @@ -305,7 +304,7 @@ void ZBarrierSetAssembler::generate_c1_load_barrier_stub(LIR_Assembler* ce,
__ addptr(rsp, 2 * BytesPerWord);

// Verify result
__ verify_oop(rax, "Bad oop");
__ verify_oop(rax);

// Move result into place
if (ref != rax) {
Expand Down Expand Up @@ -395,6 +394,7 @@ class ZSaveLiveRegisters {

MacroAssembler* const _masm;
GrowableArray<Register> _gp_registers;
GrowableArray<KRegister> _opmask_registers;
GrowableArray<XMMRegisterData> _xmm_registers;
int _spill_size;
int _spill_offset;
Expand Down Expand Up @@ -451,11 +451,21 @@ class ZSaveLiveRegisters {
__ movq(Address(rsp, _spill_offset), reg);
}

// Spill one live opmask register into the pre-sized stack area
// (8 bytes per k-register, filled top-down like the GP spills above).
void opmask_register_save(KRegister reg) {
_spill_offset -= 8;
__ kmovql(Address(rsp, _spill_offset), reg);
}

// Reload a spilled general-purpose register; offsets are walked back up
// in the reverse order of the saves.
void gp_register_restore(Register reg) {
__ movq(reg, Address(rsp, _spill_offset));
_spill_offset += 8;
}

// Reload a spilled opmask register (mirror of opmask_register_save).
void opmask_register_restore(KRegister reg) {
__ kmovql(reg, Address(rsp, _spill_offset));
_spill_offset += 8;
}

void initialize(ZLoadBarrierStubC2* stub) {
// Create mask of caller saved registers that need to
// be saved/restored if live
Expand All @@ -478,6 +488,7 @@ class ZSaveLiveRegisters {
}

int gp_spill_size = 0;
int opmask_spill_size = 0;
int xmm_spill_size = 0;

// Record registers that needs to be saved/restored
Expand All @@ -492,6 +503,13 @@ class ZSaveLiveRegisters {
_gp_registers.append(vm_reg->as_Register());
gp_spill_size += 8;
}
} else if (vm_reg->is_KRegister()) {
// All opmask registers are caller saved, thus spill the ones
// which are live.
if (_opmask_registers.find(vm_reg->as_KRegister()) == -1) {
_opmask_registers.append(vm_reg->as_KRegister());
opmask_spill_size += 8;
}
} else if (vm_reg->is_XMMRegister()) {
// We encode in the low order 4 bits of the opto_reg, how large part of the register is live
const VMReg vm_reg_base = OptoReg::as_VMReg(opto_reg & ~15);
Expand Down Expand Up @@ -519,13 +537,14 @@ class ZSaveLiveRegisters {
_xmm_registers.sort(xmm_compare_register_size);

// Stack pointer must be 16 bytes aligned for the call
_spill_offset = _spill_size = align_up(xmm_spill_size + gp_spill_size, 16);
_spill_offset = _spill_size = align_up(xmm_spill_size + gp_spill_size + opmask_spill_size, 16);
}

public:
ZSaveLiveRegisters(MacroAssembler* masm, ZLoadBarrierStubC2* stub) :
_masm(masm),
_gp_registers(),
_opmask_registers(),
_xmm_registers(),
_spill_size(0),
_spill_offset(0) {
Expand Down Expand Up @@ -575,9 +594,19 @@ class ZSaveLiveRegisters {
for (int i = 0; i < _gp_registers.length(); i++) {
gp_register_save(_gp_registers.at(i));
}

// Save opmask registers
for (int i = 0; i < _opmask_registers.length(); i++) {
opmask_register_save(_opmask_registers.at(i));
}
}

~ZSaveLiveRegisters() {
// Restore opmask registers
for (int i = _opmask_registers.length() - 1; i >= 0; i--) {
opmask_register_restore(_opmask_registers.at(i));
}

// Restore general purpose registers
for (int i = _gp_registers.length() - 1; i >= 0; i--) {
gp_register_restore(_gp_registers.at(i));
Expand Down
Loading
Loading