diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index 7005c4088b9..c46a65835ec 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -2495,6 +2495,22 @@ void Assembler::kmovwl(KRegister dst, Address src) { emit_operand((Register)dst, src); } +void Assembler::kmovwl(Address dst, KRegister src) { + assert(VM_Version::supports_evex(), ""); + InstructionMark im(this); + InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + vex_prefix(dst, 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0x91); + emit_operand((Register)src, dst); +} + +void Assembler::kmovwl(KRegister dst, KRegister src) { + assert(VM_Version::supports_avx512bw(), ""); + InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int16((unsigned char)0x90, (0xC0 | encode)); +} + void Assembler::kmovdl(KRegister dst, Register src) { assert(VM_Version::supports_avx512bw(), ""); InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); @@ -2815,6 +2831,22 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, bool mer emit_operand(dst, src); } +void Assembler::evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { + assert(VM_Version::supports_avx512vlbw(), ""); + assert(src != xnoreg, "sanity"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit); + attributes.set_embedded_opmask_register_specifier(mask); + attributes.set_is_evex_instruction(); + if (merge) { + attributes.reset_is_clear_context(); + } + vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes); + emit_int8(0x7F); + emit_operand(src, dst); +} + void Assembler::evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); InstructionMark im(this); @@ -9438,6 +9470,13 @@ void Assembler::evpblendmq (XMMRegister dst, KRegister mask, XMMRegister nds, XM emit_int16(0x64, (0xC0 | encode)); } +void Assembler::bzhiq(Register dst, Register src1, Register src2) { + assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported"); + InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false); + int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0xF5, (0xC0 | encode)); +} + void Assembler::shlxl(Register dst, Register src1, Register src2) { assert(VM_Version::supports_bmi2(), ""); InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 0f4f36c76de..58a6ca7c9eb 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -1494,6 +1494,8 @@ class Assembler : public AbstractAssembler { void kmovwl(KRegister 
dst, Register src); void kmovwl(KRegister dst, Address src); void kmovwl(Register dst, KRegister src); + void kmovwl(Address dst, KRegister src); + void kmovwl(KRegister dst, KRegister src); void kmovdl(KRegister dst, Register src); void kmovdl(Register dst, KRegister src); void kmovql(KRegister dst, KRegister src); @@ -1542,6 +1544,7 @@ class Assembler : public AbstractAssembler { void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len); void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len); void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len); + void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len); void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len); void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len); void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len); @@ -2110,6 +2113,8 @@ class Assembler : public AbstractAssembler { void shlxl(Register dst, Register src1, Register src2); void shlxq(Register dst, Register src1, Register src2); + void bzhiq(Register dst, Register src1, Register src2); + //====================VECTOR ARITHMETIC===================================== void evpmovd2m(KRegister kdst, XMMRegister src, int vector_len); void evpmovq2m(KRegister kdst, XMMRegister src, int vector_len); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index 89cdbee5604..24f163c78dc 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -987,6 +987,13 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); } + +void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) { + // assert(ArrayCopyPartialInlineSize <= 64, ""); // disabled until JDK-8261553 is applied to this tree + mov64(temp, -1L); + bzhiq(temp, temp, len); + kmovql(dst, temp); +} #endif // _LP64 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { @@ -1033,6 +1040,15 @@ void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, X reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); } +void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) { + MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); +} + +void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) { + MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len); +} + + void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, @@ -1234,7 +1250,8 @@ void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask } } -void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { +void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, + XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) { switch(vlen) { case 4: assert(vtmp1 != xnoreg, "required."); @@ -1272,14 +1289,13 @@ void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegist break; case 64: { - KRegister ktemp = k2; // Use a hardcoded temp due to no k register allocation.
assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required."); - evpcmpeqb(ktemp, src1, src2, Assembler::AVX_512bit); + evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit); if (bt == BoolTest::ne) { - ktestql(ktemp, ktemp); + ktestql(mask, mask); } else { assert(bt == BoolTest::overflow, "required"); - kortestql(ktemp, ktemp); + kortestql(mask, mask); } } break; diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index 5e62c3705d3..ab574fb3716 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -67,6 +67,9 @@ void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len); void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len); + void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len); + void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len); + // extract void extract(BasicType typ, Register dst, XMMRegister src, int idx); XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex); @@ -75,7 +78,7 @@ // vector test void vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, - XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg); + XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg, KRegister mask = knoreg); // blend void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1); @@ -90,6 +93,7 @@ void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); #ifdef _LP64 void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void genmask(KRegister dst, Register len, Register temp); #endif // _LP64 // dst = reduce(op, src2) using vtmp as temps diff --git a/src/hotspot/cpu/x86/gc/z/zBarrierSetAssembler_x86.cpp b/src/hotspot/cpu/x86/gc/z/zBarrierSetAssembler_x86.cpp index fbe2e6c8956..5b6349b4452 100644 --- a/src/hotspot/cpu/x86/gc/z/zBarrierSetAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/gc/z/zBarrierSetAssembler_x86.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -150,7 +150,6 @@ void ZBarrierSetAssembler::load_at(MacroAssembler* masm, // Call VM call_vm(masm, ZBarrierSetRuntime::load_barrier_on_oop_field_preloaded_addr(decorators), dst, scratch); - // Restore registers __ movdqu(xmm0, Address(rsp, xmm_size * 0)); __ movdqu(xmm1, Address(rsp, xmm_size * 1)); __ movdqu(xmm2, Address(rsp, xmm_size * 2)); @@ -305,7 +304,7 @@ void ZBarrierSetAssembler::generate_c1_load_barrier_stub(LIR_Assembler* ce, __ addptr(rsp, 2 * BytesPerWord); // Verify result - __ verify_oop(rax, "Bad oop"); + __ verify_oop(rax); // Move result into place if (ref != rax) { @@ -395,6 +394,7 @@ class ZSaveLiveRegisters { MacroAssembler* const _masm; GrowableArray<Register> _gp_registers; + GrowableArray<KRegister> _opmask_registers; GrowableArray<XMMRegisterData> _xmm_registers; int _spill_size; int _spill_offset; @@ -451,11 +451,21 @@ class ZSaveLiveRegisters { __ movq(Address(rsp, _spill_offset), reg); } + void opmask_register_save(KRegister reg) { + _spill_offset -= 8; + __ kmovql(Address(rsp, _spill_offset), reg); + } + void gp_register_restore(Register reg) { __ movq(reg, Address(rsp, _spill_offset)); _spill_offset += 8; } + void opmask_register_restore(KRegister reg) { + __ kmovql(reg, Address(rsp, _spill_offset)); + _spill_offset += 8; + } + void initialize(ZLoadBarrierStubC2* stub) { // Create mask of caller saved registers that need to // be saved/restored if live @@ -478,6 +488,7 @@ class ZSaveLiveRegisters { } int gp_spill_size = 0; + int opmask_spill_size = 0; int xmm_spill_size = 0; // Record registers that needs to be saved/restored @@ -492,6 +503,13 @@ class ZSaveLiveRegisters { _gp_registers.append(vm_reg->as_Register()); gp_spill_size += 8; } + } else if (vm_reg->is_KRegister()) { + // All opmask registers are caller saved, thus spill the ones + // which are live.
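+ // Each live opmask register gets an 8-byte slot: kmovql spills the full
+ // 64-bit (AVX512BW-width) register, mirroring the GP register scheme above.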
+ if (_opmask_registers.find(vm_reg->as_KRegister()) == -1) { + _opmask_registers.append(vm_reg->as_KRegister()); + opmask_spill_size += 8; + } } else if (vm_reg->is_XMMRegister()) { // We encode in the low order 4 bits of the opto_reg, how large part of the register is live const VMReg vm_reg_base = OptoReg::as_VMReg(opto_reg & ~15); @@ -519,13 +537,14 @@ class ZSaveLiveRegisters { _xmm_registers.sort(xmm_compare_register_size); // Stack pointer must be 16 bytes aligned for the call - _spill_offset = _spill_size = align_up(xmm_spill_size + gp_spill_size, 16); + _spill_offset = _spill_size = align_up(xmm_spill_size + gp_spill_size + opmask_spill_size, 16); } public: ZSaveLiveRegisters(MacroAssembler* masm, ZLoadBarrierStubC2* stub) : _masm(masm), _gp_registers(), + _opmask_registers(), _xmm_registers(), _spill_size(0), _spill_offset(0) { @@ -575,9 +594,19 @@ class ZSaveLiveRegisters { for (int i = 0; i < _gp_registers.length(); i++) { gp_register_save(_gp_registers.at(i)); } + + // Save opmask registers + for (int i = 0; i < _opmask_registers.length(); i++) { + opmask_register_save(_opmask_registers.at(i)); + } } ~ZSaveLiveRegisters() { + // Restore opmask registers + for (int i = _opmask_registers.length() - 1; i >= 0; i--) { + opmask_register_restore(_opmask_registers.at(i)); + } + // Restore general purpose registers for (int i = _gp_registers.length() - 1; i >= 0; i--) { gp_register_restore(_gp_registers.at(i)); diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index f411993b773..1fca58aa873 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -3498,18 +3498,18 @@ void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) { } #ifdef COMPILER2 -void MacroAssembler::setvectmask(Register dst, Register src) { +void MacroAssembler::setvectmask(Register dst, Register src, KRegister mask) { guarantee(PostLoopMultiversioning, "must be"); Assembler::movl(dst, 1); Assembler::shlxl(dst, dst, src); Assembler::decl(dst); - Assembler::kmovdl(k1, dst); + Assembler::kmovdl(mask, dst); Assembler::movl(dst, src); } -void MacroAssembler::restorevectmask() { +void MacroAssembler::restorevectmask(KRegister mask) { guarantee(PostLoopMultiversioning, "must be"); - Assembler::knotwl(k1, k0); + Assembler::knotwl(mask, k0); } #endif // COMPILER2 @@ -3605,6 +3605,59 @@ void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scrat } } +void MacroAssembler::kmov(KRegister dst, Address src) { + if (VM_Version::supports_avx512bw()) { + kmovql(dst, src); + } else { + assert(VM_Version::supports_evex(), ""); + kmovwl(dst, src); + } +} + +void MacroAssembler::kmov(Address dst, KRegister src) { + if (VM_Version::supports_avx512bw()) { + kmovql(dst, src); + } else { + assert(VM_Version::supports_evex(), ""); + kmovwl(dst, src); + } +} + +void MacroAssembler::kmov(KRegister dst, KRegister src) { + if (VM_Version::supports_avx512bw()) { + kmovql(dst, src); + } else { + assert(VM_Version::supports_evex(), ""); + kmovwl(dst, src); + } +} + +void MacroAssembler::kmov(Register dst, KRegister src) { + if (VM_Version::supports_avx512bw()) { + kmovql(dst, src); + } else { + assert(VM_Version::supports_evex(), ""); + kmovwl(dst, src); + } +} + +void MacroAssembler::kmov(KRegister dst, Register src) { + if (VM_Version::supports_avx512bw()) { + kmovql(dst, src); + } else { + assert(VM_Version::supports_evex(), ""); + kmovwl(dst, src); + } +} + +void MacroAssembler::kmovql(KRegister dst, AddressLiteral src, 
Register scratch_reg) { + if (reachable(src)) { + kmovql(dst, as_Address(src)); + } else { + lea(scratch_reg, src); + kmovql(dst, Address(scratch_reg, 0)); + } +} void MacroAssembler::kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg) { if (reachable(src)) { @@ -6141,7 +6194,79 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp BIND(L_end); } -void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) { +// Clearing constant sized memory using YMM/ZMM registers. +void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) { + assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), ""); + bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0; + + int vector64_count = (cnt & (~0x7)) >> 3; + cnt = cnt & 0x7; + + // 64 byte initialization loop. + vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit); + for (int i = 0; i < vector64_count; i++) { + fill64_avx(base, i * 64, xtmp, use64byteVector); + } + + // Clear remaining 64 byte tail. + int disp = vector64_count * 64; + if (cnt) { + switch (cnt) { + case 1: + movq(Address(base, disp), xtmp); + break; + case 2: + evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_128bit); + break; + case 3: + movl(rtmp, 0x7); + kmovwl(mask, rtmp); + evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_256bit); + break; + case 4: + evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit); + break; + case 5: + if (use64byteVector) { + movl(rtmp, 0x1F); + kmovwl(mask, rtmp); + evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit); + } else { + evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit); + movq(Address(base, disp + 32), xtmp); + } + break; + case 6: + if (use64byteVector) { + movl(rtmp, 0x3F); + kmovwl(mask, rtmp); + evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit); + } else { + evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit); + evmovdqu(T_LONG, k0, Address(base, disp + 32), xtmp, Assembler::AVX_128bit); + } + break; + case 7: + if (use64byteVector) { + movl(rtmp, 0x7F); + kmovwl(mask, rtmp); + evmovdqu(T_LONG, mask, Address(base, disp), xtmp, Assembler::AVX_512bit); + } else { + evmovdqu(T_LONG, k0, Address(base, disp), xtmp, Assembler::AVX_256bit); + movl(rtmp, 0x7); + kmovwl(mask, rtmp); + evmovdqu(T_LONG, mask, Address(base, disp + 32), xtmp, Assembler::AVX_256bit); + } + break; + default: + fatal("Unexpected length : %d\n",cnt); + break; + } + } +} + +void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, + bool is_large, KRegister mask) { // cnt - number of qwords (8-byte words). // base - start address, qword aligned. // is_large - if optimizers know cnt is larger than InitArrayShortSize @@ -6182,7 +6307,7 @@ void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMReg shlptr(cnt, 3); // convert to number of bytes rep_stosb(); } else if (UseXMMForObjInit) { - movptr(tmp, base); + movptr(tmp, base); xmm_clear_mem(tmp, cnt, xtmp); } else { NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM @@ -6806,7 +6931,7 @@ void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register // Compare strings, used for char[] and byte[]. 
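// The trailing KRegister argument replaces the formerly hardcoded k7 temp in
// the AVX3 loop below; EVEX-capable callers pass a register-allocated mask.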
void MacroAssembler::string_compare(Register str1, Register str2, Register cnt1, Register cnt2, Register result, - XMMRegister vec1, int ae) { + XMMRegister vec1, int ae, KRegister mask) { ShortBranchVerifier sbv(this); Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 @@ -6959,12 +7084,12 @@ void MacroAssembler::string_compare(Register str1, Register str2, bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); - evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 + evpcmpeqb(mask, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // mask == 11..11, if operands equal, otherwise mask has some 0 } else { vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); - evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 + evpcmpeqb(mask, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // mask == 11..11, if operands equal, otherwise mask has some 0 } - kortestql(k7, k7); + kortestql(mask, mask); jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare addptr(result, stride2x2); // update since we already compared at this addr subl(cnt2, stride2x2); // and sub the size too @@ -7148,7 +7273,7 @@ void MacroAssembler::string_compare(Register str1, Register str2, bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); - kmovql(cnt1, k7); + kmovql(cnt1, mask); notq(cnt1); bsfq(cnt2, cnt1); if (ae != StrIntrinsicNode::LL) { @@ -7197,7 +7322,7 @@ void MacroAssembler::string_compare(Register str1, Register str2, // } void MacroAssembler::has_negatives(Register ary1, Register len, Register result, Register tmp1, - XMMRegister vec1, XMMRegister vec2) { + XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { // rsi: byte array // rcx: len // rax: result @@ -7229,8 +7354,8 @@ void MacroAssembler::has_negatives(Register ary1, Register len, bind(test_64_loop); // Check whether our 64 elements of size byte contain negatives - evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); - kortestql(k2, k2); + evpcmpgtb(mask1, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); + kortestql(mask1, mask1); jcc(Assembler::notZero, TRUE_LABEL); addptr(len, 64); @@ -7247,7 +7372,7 @@ void MacroAssembler::has_negatives(Register ary1, Register len, mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); shlxq(tmp3_aliased, tmp3_aliased, tmp1); notq(tmp3_aliased); - kmovql(k3, tmp3_aliased); + kmovql(mask2, tmp3_aliased); #else Label k_init; jmp(k_init); @@ -7272,11 +7397,11 @@ void MacroAssembler::has_negatives(Register ary1, Register len, lea(len, InternalAddress(tmp)); // create mask to test for negative byte inside a vector evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); - evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit); + evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit); #endif - evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit); - ktestq(k2, k3); + evpcmpgtb(mask1, mask2, vec2, Address(ary1, 0), Assembler::AVX_512bit); + ktestq(mask1, mask2); jcc(Assembler::notZero, TRUE_LABEL); jmp(FALSE_LABEL); @@ -7403,7 +7528,7 @@ void MacroAssembler::has_negatives(Register ary1, Register len, // Compare char[] or byte[] arrays aligned to 4
bytes or substrings. void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, Register limit, Register result, Register chr, - XMMRegister vec1, XMMRegister vec2, bool is_char) { + XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask) { ShortBranchVerifier sbv(this); Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; @@ -7466,8 +7591,8 @@ void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ar bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); - evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); - kortestql(k7, k7); + evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); + kortestql(mask, mask); jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare addptr(limit, 64); // update since we already compared at this addr cmpl(limit, -64); @@ -7484,8 +7609,8 @@ void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ar // addptr(result, -64); // it is safe, bc we just came from this area evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); - evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); - kortestql(k7, k7); + evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); + kortestql(mask, mask); jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare jmp(TRUE_LABEL); @@ -10228,7 +10353,7 @@ void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Registe void MacroAssembler::char_array_compress(Register src, Register dst, Register len, XMMRegister tmp1Reg, XMMRegister tmp2Reg, XMMRegister tmp3Reg, XMMRegister tmp4Reg, - Register tmp5, Register result) { + Register tmp5, Register result, KRegister mask1, KRegister mask2) { Label copy_chars_loop, return_length, return_zero, done; // rsi: src @@ -10280,14 +10405,14 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le movl(result, 0xFFFFFFFF); shlxl(result, result, tmp5); notl(result); - kmovdl(k3, result); + kmovdl(mask2, result); - evmovdquw(tmp1Reg, k3, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); - evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); - ktestd(k2, k3); + evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); + evpcmpuw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); + ktestd(mask1, mask2); jcc(Assembler::carryClear, return_zero); - evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit); + evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); addptr(src, tmp5); addptr(src, tmp5); @@ -10308,8 +10433,8 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le bind(copy_32_loop); evmovdquw(tmp1Reg, Address(src, len, Address::times_2), /*merge*/ false, Assembler::AVX_512bit); - evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); - kortestdl(k2, k2); + evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); + kortestdl(mask1, mask1); jcc(Assembler::carryClear, return_zero); // All elements in current processed chunk are valid candidates for @@ -10330,14 +10455,14 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le shlxl(result, result, len); notl(result); - kmovdl(k3, result); + kmovdl(mask2, result); - evmovdquw(tmp1Reg, k3, Address(src, 0), 
/*merge*/ false, Assembler::AVX_512bit); - evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); - ktestd(k2, k3); + evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit); + evpcmpuw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit); + ktestd(mask1, mask2); jcc(Assembler::carryClear, return_zero); - evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit); + evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit); jmp(return_length); bind(below_threshold); @@ -10437,7 +10562,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le // } // } void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, - XMMRegister tmp1, Register tmp2) { + XMMRegister tmp1, Register tmp2, KRegister mask) { Label copy_chars_loop, done, below_threshold, avx3_threshold; // rsi: src // rdi: dst @@ -10490,9 +10615,9 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len movl(tmp3_aliased, -1); shlxl(tmp3_aliased, tmp3_aliased, tmp2); notl(tmp3_aliased); - kmovdl(k2, tmp3_aliased); - evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit); - evmovdquw(Address(dst, 0), k2, tmp1, /*merge*/ true, Assembler::AVX_512bit); + kmovdl(mask, tmp3_aliased); + evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit); + evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit); jmp(done); bind(avx3_threshold); @@ -10578,6 +10703,111 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len bind(done); } +void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) { + switch(type) { + case T_BYTE: + case T_BOOLEAN: + evmovdqub(dst, kmask, src, false, vector_len); + break; + case T_CHAR: + case T_SHORT: + evmovdquw(dst, kmask, src, false, vector_len); + break; + case T_INT: + case T_FLOAT: + evmovdqul(dst, kmask, src, false, vector_len); + break; + case T_LONG: + case T_DOUBLE: + evmovdquq(dst, kmask, src, false, vector_len); + break; + default: + fatal("Unexpected type argument %s", type2name(type)); + break; + } +} + +void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) { + switch(type) { + case T_BYTE: + case T_BOOLEAN: + evmovdqub(dst, kmask, src, true, vector_len); + break; + case T_CHAR: + case T_SHORT: + evmovdquw(dst, kmask, src, true, vector_len); + break; + case T_INT: + case T_FLOAT: + evmovdqul(dst, kmask, src, true, vector_len); + break; + case T_LONG: + case T_DOUBLE: + evmovdquq(dst, kmask, src, true, vector_len); + break; + default: + fatal("Unexpected type argument %s", type2name(type)); + break; + } +} + +#if COMPILER2_OR_JVMCI + + +// Set memory operation for length "less than" 64 bytes. 
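+// The tail mask is built as (1 << remaining_element_count) - 1 via
+// movl/shlxl/subptr and moved into an opmask register with kmovwl, so one
+// masked store covers the whole sub-vector tail.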
+void MacroAssembler::fill64_masked_avx(uint shift, Register dst, int disp, + XMMRegister xmm, KRegister mask, Register length, + Register temp, bool use64byteVector) { + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + assert(shift != 0, "shift value should be 1 (short),2(int) or 3(long)"); + BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; + if (!use64byteVector) { + fill32_avx(dst, disp, xmm); + subptr(length, 32 >> shift); + fill32_masked_avx(shift, dst, disp + 32, xmm, mask, length, temp); + } else { + assert(MaxVectorSize == 64, "vector length != 64"); + movl(temp, 1); + shlxl(temp, temp, length); + subptr(temp, 1); + kmovwl(mask, temp); + evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_512bit); + } +} + + +void MacroAssembler::fill32_masked_avx(uint shift, Register dst, int disp, + XMMRegister xmm, KRegister mask, Register length, + Register temp) { + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + assert(shift != 0, "shift value should be 1 (short), 2(int) or 3(long)"); + BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG}; + movl(temp, 1); + shlxl(temp, temp, length); + subptr(temp, 1); + kmovwl(mask, temp); + evmovdqu(type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_256bit); +} + + +void MacroAssembler::fill32_avx(Register dst, int disp, XMMRegister xmm) { + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + vmovdqu(Address(dst, disp), xmm); +} + +void MacroAssembler::fill64_avx(Register dst, int disp, XMMRegister xmm, bool use64byteVector) { + assert(MaxVectorSize >= 32, "vector length should be >= 32"); + BasicType type[] = {T_BYTE, T_SHORT, T_INT, T_LONG}; + if (!use64byteVector) { + fill32_avx(dst, disp, xmm); + fill32_avx(dst, disp + 32, xmm); + } else { + evmovdquq(Address(dst, disp), xmm, Assembler::AVX_512bit); + } +} + +#endif //COMPILER2_OR_JVMCI + Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) { switch (cond) { // Note some conditions are synonyms for others diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 461304c1d33..a5aec844f14 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -39,6 +39,10 @@ class MacroAssembler: public Assembler { friend class Runtime1; // as_Address() public: + + void setvectmask(Register dst, Register src, KRegister mask); + void restorevectmask(KRegister mask); + // Support for VM calls // // This is the base routine called by the different versions of call_VM_leaf. 
The interpreter @@ -1121,6 +1125,23 @@ class MacroAssembler: public Assembler { void kmovwl(Register dst, KRegister src) { Assembler::kmovwl(dst, src); } void kmovwl(KRegister dst, Address src) { Assembler::kmovwl(dst, src); } void kmovwl(KRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); + void kmovwl(Address dst, KRegister src) { Assembler::kmovwl(dst, src); } + void kmovwl(KRegister dst, KRegister src) { Assembler::kmovwl(dst, src); } + + void kmovql(KRegister dst, KRegister src) { Assembler::kmovql(dst, src); } + void kmovql(KRegister dst, Register src) { Assembler::kmovql(dst, src); } + void kmovql(Register dst, KRegister src) { Assembler::kmovql(dst, src); } + void kmovql(KRegister dst, Address src) { Assembler::kmovql(dst, src); } + void kmovql(Address dst, KRegister src) { Assembler::kmovql(dst, src); } + void kmovql(KRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); + + // Safe move operation, lowers down to a 16-bit move on targets supporting only + // the AVX512F feature and to a 64-bit move on targets also supporting AVX512BW. + void kmov(Address dst, KRegister src); + void kmov(KRegister dst, Address src); + void kmov(KRegister dst, KRegister src); + void kmov(Register dst, KRegister src); + void kmov(KRegister dst, Register src); // AVX Unaligned forms void vmovdqu(Address dst, XMMRegister src); @@ -1129,10 +1150,14 @@ class MacroAssembler: public Assembler { void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1); // AVX512 Unaligned + void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len); + void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len); + void evmovdqub(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); } void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); } void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); } void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); } + void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); } void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg); void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquw(dst, src, merge, vector_len); } @@ -1725,7 +1750,10 @@ class MacroAssembler: public Assembler { // clear memory of size 'cnt' qwords, starting at 'base'; // if 'is_large' is set, do not try to produce short loop - void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large); + void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large, KRegister mask=knoreg); + + // clear memory of constant size 'cnt' qwords, starting at 'base' + void clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg); // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers void xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp); @@ -1760,18 +1788,18 @@ class MacroAssembler: public Assembler { // Compare strings.
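// The KRegister parameters in the intrinsics below default to knoreg so that
// pre-AVX512 call sites stay unchanged; EVEX paths pass allocated masks.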
void string_compare(Register str1, Register str2, Register cnt1, Register cnt2, Register result, - XMMRegister vec1, int ae); + XMMRegister vec1, int ae, KRegister mask = knoreg); // Search for Non-ASCII character (Negative byte value) in a byte array, // return true if it has any and false otherwise. void has_negatives(Register ary1, Register len, Register result, Register tmp1, - XMMRegister vec1, XMMRegister vec2); + XMMRegister vec1, XMMRegister vec2, KRegister mask1 = knoreg, KRegister mask2 = knoreg); // Compare char[] or byte[] arrays. void arrays_equals(bool is_array_equ, Register ary1, Register ary2, Register limit, Register result, Register chr, - XMMRegister vec1, XMMRegister vec2, bool is_char); + XMMRegister vec1, XMMRegister vec2, bool is_char, KRegister mask = knoreg); #endif @@ -1886,11 +1914,24 @@ class MacroAssembler: public Assembler { // Compress char[] array to byte[]. void char_array_compress(Register src, Register dst, Register len, XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3, - XMMRegister tmp4, Register tmp5, Register result); + XMMRegister tmp4, Register tmp5, Register result, + KRegister mask1 = knoreg, KRegister mask2 = knoreg); // Inflate byte[] array to char[]. void byte_array_inflate(Register src, Register dst, Register len, - XMMRegister tmp1, Register tmp2); + XMMRegister tmp1, Register tmp2, KRegister mask = knoreg); + + void fill64_masked_avx(uint shift, Register dst, int disp, + XMMRegister xmm, KRegister mask, Register length, + Register temp, bool use64byteVector = false); + + void fill32_masked_avx(uint shift, Register dst, int disp, + XMMRegister xmm, KRegister mask, Register length, + Register temp); + + void fill32_avx(Register dst, int disp, XMMRegister xmm); + + void fill64_avx(Register dst, int disp, XMMRegister xmm, bool use64byteVector = false); void vallones(XMMRegister dst, int vector_len); }; diff --git a/src/hotspot/cpu/x86/register_x86.hpp b/src/hotspot/cpu/x86/register_x86.hpp index 8cc85e0878b..df65003308d 100644 --- a/src/hotspot/cpu/x86/register_x86.hpp +++ b/src/hotspot/cpu/x86/register_x86.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -236,7 +236,9 @@ class KRegisterImpl : public AbstractRegisterImpl { public: enum { number_of_registers = 8, - max_slots_per_register = 1 + // Opmask registers are 64 bits wide on both 32-bit and 64-bit targets, + // thus two slots are reserved per register. + max_slots_per_register = 2 }; // construction @@ -274,10 +276,14 @@ class ConcreteRegisterImpl : public AbstractRegisterImpl { // There is no requirement that any ordering here matches any ordering c2 gives // it's optoregs. + // x86_32.ad defines additional dummy FILL0-FILL7 registers in order to tally + // REG_COUNT (computed by ADLC based on the number of reg_defs seen in .ad files) + // with ConcreteRegisterImpl::number_of_registers; an additional count of 8 is + // therefore added for the 32-bit JVM.
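+ // With two slots per register, the 8 opmask registers now contribute
+ // 8 * 2 = 16 slots to the total below (previously 8).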
number_of_registers = RegisterImpl::number_of_registers * RegisterImpl::max_slots_per_register + - 2 * FloatRegisterImpl::number_of_registers + + 2 * FloatRegisterImpl::number_of_registers + NOT_LP64(8) LP64_ONLY(0) + XMMRegisterImpl::max_slots_per_register * XMMRegisterImpl::number_of_registers + - KRegisterImpl::number_of_registers + // mask registers + KRegisterImpl::number_of_registers * KRegisterImpl::max_slots_per_register + // mask registers 1 // eflags }; diff --git a/src/hotspot/cpu/x86/sharedRuntime_x86_32.cpp b/src/hotspot/cpu/x86/sharedRuntime_x86_32.cpp index 20b2ff1b3a4..93f5fad63b2 100644 --- a/src/hotspot/cpu/x86/sharedRuntime_x86_32.cpp +++ b/src/hotspot/cpu/x86/sharedRuntime_x86_32.cpp @@ -129,6 +129,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ int ymm_bytes = num_xmm_regs * 16; int zmm_bytes = num_xmm_regs * 32; #ifdef COMPILER2 + int opmask_state_bytes = KRegisterImpl::number_of_registers * 8; if (save_vectors) { assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported"); @@ -137,6 +138,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ if (UseAVX > 2) { // Save upper half of ZMM registers as well vect_bytes += zmm_bytes; + additional_frame_words += opmask_state_bytes / wordSize; } additional_frame_words += vect_bytes / wordSize; } @@ -227,6 +229,11 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ for (int n = 0; n < num_xmm_regs; n++) { __ vextractf64x4_high(Address(rsp, n*32), as_XMMRegister(n)); } + __ subptr(rsp, opmask_state_bytes); + // Save opmask registers + for (int n = 0; n < KRegisterImpl::number_of_registers; n++) { + __ kmov(Address(rsp, n*8), as_KRegister(n)); + } } } __ vzeroupper(); @@ -249,6 +256,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ // rbp, location is known implicitly, no oopMap map->set_callee_saved(STACK_OFFSET(rsi_off), rsi->as_VMReg()); map->set_callee_saved(STACK_OFFSET(rdi_off), rdi->as_VMReg()); + // %%% This is really a waste but we'll keep things as they were for now for the upper component off = st0_off; delta = st1_off - off; @@ -273,11 +281,12 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ } void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) { + int opmask_state_bytes = 0; + int additional_frame_bytes = 0; int num_xmm_regs = XMMRegisterImpl::number_of_registers; int ymm_bytes = num_xmm_regs * 16; int zmm_bytes = num_xmm_regs * 32; // Recover XMM & FPU state - int additional_frame_bytes = 0; #ifdef COMPILER2 if (restore_vectors) { assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX"); @@ -287,6 +296,8 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve if (UseAVX > 2) { // Save upper half of ZMM registers as well additional_frame_bytes += zmm_bytes; + opmask_state_bytes = KRegisterImpl::number_of_registers * 8; + additional_frame_bytes += opmask_state_bytes; } } #else @@ -320,11 +331,14 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve for (int n = 0; n < num_xmm_regs; n++) { __ vinsertf128_high(as_XMMRegister(n), Address(rsp, n*16+off)); } - if (UseAVX > 2) { // Restore upper half of ZMM registers. 
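+ // The opmask save area sits below the ZMM upper halves on the stack, so
+ // the ZMM restore offsets are biased by opmask_state_bytes.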
+ off = opmask_state_bytes; for (int n = 0; n < num_xmm_regs; n++) { - __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, n*32)); + __ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, n*32+off)); + } + for (int n = 0; n < KRegisterImpl::number_of_registers; n++) { + __ kmov(as_KRegister(n), Address(rsp, n*8)); } } __ addptr(rsp, additional_frame_bytes); diff --git a/src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp b/src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp index 3bab1d14d09..0c558bd7ed8 100644 --- a/src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp +++ b/src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp @@ -95,11 +95,13 @@ class RegisterSaver { // units because compiler frame slots are jints. #define XSAVE_AREA_BEGIN 160 #define XSAVE_AREA_YMM_BEGIN 576 +#define XSAVE_AREA_OPMASK_BEGIN 1088 #define XSAVE_AREA_ZMM_BEGIN 1152 #define XSAVE_AREA_UPPERBANK 1664 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off #define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum)*32/BytesPerInt, zmm ## regnum ## H_off +#define DEF_OPMASK_OFFS(regnum) opmask ## regnum ## _off = opmask_off + (regnum)*8/BytesPerInt, opmask ## regnum ## H_off #define DEF_ZMM_UPPER_OFFS(regnum) zmm ## regnum ## _off = zmm_upper_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off enum layout { fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area @@ -111,6 +113,10 @@ class RegisterSaver { DEF_YMM_OFFS(0), DEF_YMM_OFFS(1), // 2..15 are implied in range usage + opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, + DEF_OPMASK_OFFS(0), + DEF_OPMASK_OFFS(1), + // 2..7 are implied in range usage zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt, DEF_ZMM_OFFS(0), DEF_ZMM_OFFS(1), @@ -218,6 +224,13 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ for (int n = 16; n < num_xmm_regs; n++) { __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); } +#if COMPILER2_OR_JVMCI + base_addr = XSAVE_AREA_OPMASK_BEGIN; + off = 0; + for(int n = 0; n < KRegisterImpl::number_of_registers; n++) { + __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); + } +#endif } } else { if (VM_Version::supports_evex()) { @@ -228,6 +241,13 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_ for (int n = 16; n < num_xmm_regs; n++) { __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len); } +#if COMPILER2_OR_JVMCI + base_addr = XSAVE_AREA_OPMASK_BEGIN; + off = 0; + for(int n = 0; n < KRegisterImpl::number_of_registers; n++) { + __ kmov(Address(rsp, base_addr+(off++*8)), as_KRegister(n)); + } +#endif } } __ vzeroupper(); @@ -387,6 +407,13 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wi for (int n = 16; n < num_xmm_regs; n++) { __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len); } +#if COMPILER2_OR_JVMCI + base_addr = XSAVE_AREA_OPMASK_BEGIN; + off = 0; + for (int n = 0; n < KRegisterImpl::number_of_registers; n++) { + __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); + } +#endif } } else { if (VM_Version::supports_evex()) { @@ -397,6 +424,13 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_wi for (int n = 16; n < num_xmm_regs; n++) { __ evmovdqul(as_XMMRegister(n), Address(rsp, 
base_addr+(off++*64)), vector_len); } +#if COMPILER2_OR_JVMCI + base_addr = XSAVE_AREA_OPMASK_BEGIN; + off = 0; + for (int n = 0; n < KRegisterImpl::number_of_registers; n++) { + __ kmov(as_KRegister(n), Address(rsp, base_addr+(off++*8))); + } +#endif } } diff --git a/src/hotspot/cpu/x86/vmreg_x86.hpp b/src/hotspot/cpu/x86/vmreg_x86.hpp index b382110d2a5..259a3b0c3fa 100644 --- a/src/hotspot/cpu/x86/vmreg_x86.hpp +++ b/src/hotspot/cpu/x86/vmreg_x86.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -82,7 +82,7 @@ inline XMMRegister as_XMMRegister() { inline KRegister as_KRegister() { assert(is_KRegister(), "must be"); // Yuk - return ::as_KRegister((value() - ConcreteRegisterImpl::max_xmm)); + return ::as_KRegister((value() - ConcreteRegisterImpl::max_xmm) >> 1); } inline bool is_concrete() { diff --git a/src/hotspot/cpu/x86/vmreg_x86.inline.hpp b/src/hotspot/cpu/x86/vmreg_x86.inline.hpp index bad5235c35d..3bfe99cd0c5 100644 --- a/src/hotspot/cpu/x86/vmreg_x86.inline.hpp +++ b/src/hotspot/cpu/x86/vmreg_x86.inline.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -43,7 +43,7 @@ inline VMReg XMMRegisterImpl::as_VMReg() { } inline VMReg KRegisterImpl::as_VMReg() { - return VMRegImpl::as_VMReg(encoding() + ConcreteRegisterImpl::max_xmm); + return VMRegImpl::as_VMReg((encoding() << 1) + ConcreteRegisterImpl::max_xmm); } #endif // CPU_X86_VM_VMREG_X86_INLINE_HPP diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 7dd6dd4bde2..35ad1209af7 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -628,6 +628,29 @@ reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad()); reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad()); #endif // _LP64 +// AVX3 Mask Registers. 
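+// K0 is deliberately not modeled: encoding k0 as the mask operand of an EVEX
+// instruction means "no masking", so k0 cannot hold an allocatable predicate.
+// Each opmask register is 64 bits wide under AVX512BW and therefore occupies
+// two 32-bit slots, Kn and Kn_H.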
+reg_def K1 (SOC, SOC, Op_RegI, 1, k1->as_VMReg()); +reg_def K1_H (SOC, SOC, Op_RegI, 1, k1->as_VMReg()->next()); + +reg_def K2 (SOC, SOC, Op_RegI, 2, k2->as_VMReg()); +reg_def K2_H (SOC, SOC, Op_RegI, 2, k2->as_VMReg()->next()); + +reg_def K3 (SOC, SOC, Op_RegI, 3, k3->as_VMReg()); +reg_def K3_H (SOC, SOC, Op_RegI, 3, k3->as_VMReg()->next()); + +reg_def K4 (SOC, SOC, Op_RegI, 4, k4->as_VMReg()); +reg_def K4_H (SOC, SOC, Op_RegI, 4, k4->as_VMReg()->next()); + +reg_def K5 (SOC, SOC, Op_RegI, 5, k5->as_VMReg()); +reg_def K5_H (SOC, SOC, Op_RegI, 5, k5->as_VMReg()->next()); + +reg_def K6 (SOC, SOC, Op_RegI, 6, k6->as_VMReg()); +reg_def K6_H (SOC, SOC, Op_RegI, 6, k6->as_VMReg()->next()); + +reg_def K7 (SOC, SOC, Op_RegI, 7, k7->as_VMReg()); +reg_def K7_H (SOC, SOC, Op_RegI, 7, k7->as_VMReg()->next()); + + alloc_class chunk1(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p, XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p, XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p, @@ -664,8 +687,32 @@ alloc_class chunk1(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, #endif ); +alloc_class chunk2(K7, K7_H, + K6, K6_H, + K5, K5_H, + K4, K4_H, + K3, K3_H, + K2, K2_H, + K1, K1_H); + +reg_class vectmask_reg(K2, K2_H, + K3, K3_H, + K4, K4_H, + K5, K5_H, + K6, K6_H, + K7, K7_H); + +reg_class vectmask_reg_K1(K1, K1_H); +reg_class vectmask_reg_K2(K2, K2_H); +reg_class vectmask_reg_K3(K3, K3_H); +reg_class vectmask_reg_K4(K4, K4_H); +reg_class vectmask_reg_K5(K5, K5_H); +reg_class vectmask_reg_K6(K6, K6_H); +reg_class vectmask_reg_K7(K7, K7_H); + // flags allocation class should be last. -alloc_class chunk2(RFLAGS); +alloc_class chunk3(RFLAGS); + // Singleton class for condition codes reg_class int_flags(RFLAGS); @@ -1460,6 +1507,7 @@ const bool Matcher::match_rule_supported(int opcode) { if (!has_match_rule(opcode)) { return false; // no match rule present } + const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); switch (opcode) { case Op_AbsVL: case Op_StoreVectorScatter: @@ -1624,6 +1672,15 @@ const bool Matcher::match_rule_supported(int opcode) { return false; } break; + + case Op_VectorMaskGen: + case Op_LoadVectorMasked: + case Op_StoreVectorMasked: + if (!is_LP64 || UseAVX < 3 || !VM_Version::supports_bmi2()) { + return false; + } + break; + case Op_SqrtF: if (UseSSE < 1) { return false; @@ -1648,6 +1705,7 @@ const bool Matcher::match_rule_supported(int opcode) { // Identify extra cases that we might want to provide match rules for vector nodes and // other intrinsics guarded with vector length (vlen) and element type (bt). 
const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { + const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false); if (!match_rule_supported(opcode)) { return false; } @@ -1692,6 +1750,16 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType return false; } break; + case Op_VectorMaskGen: + case Op_LoadVectorMasked: + case Op_StoreVectorMasked: + if (!is_LP64 || !VM_Version::supports_avx512bw()) { + return false; + } + if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) { + return false; + } + break; case Op_CMoveVD: if (vlen != 4) { return false; // implementation limitation (only vcmov4D_reg is present) @@ -1904,6 +1972,14 @@ const bool Matcher::has_predicated_vectors(void) { return ret_value; } +const RegMask* Matcher::predicate_reg_mask(void) { + return &_VECTMASK_REG_mask; +} + +const TypeVect* Matcher::predicate_reg_type(const Type* elemTy, int length) { + return new TypeVectMask(TypeInt::BOOL, length); +} + const int Matcher::float_pressure(int default_pressure_threshold) { int float_pressure_threshold = default_pressure_threshold; #ifdef _LP64 @@ -2578,14 +2654,18 @@ instruct ShouldNotReachHere() %{ %} // =================================EVEX special=============================== - -instruct setMask(rRegI dst, rRegI src) %{ - predicate(Matcher::has_predicated_vectors()); +// Existing partial implementation for post-loop multi-versioning computes +// the mask corresponding to tail loop in K1 opmask register. This may then be +// used for predicating instructions in loop body during last post-loop iteration. +// TODO: Remove hard-coded K1 usage while fixing existing post-loop +// multiversioning support. +instruct setMask(rRegI dst, rRegI src, kReg_K1 mask) %{ + predicate(PostLoopMultiversioning && Matcher::has_predicated_vectors()); match(Set dst (SetVectMaskI src)); effect(TEMP dst); format %{ "setvectmask $dst, $src" %} ins_encode %{ - __ setvectmask($dst$$Register, $src$$Register); + __ setvectmask($dst$$Register, $src$$Register, $mask$$KRegister); %} ins_pipe(pipe_slow); %} @@ -3629,10 +3709,10 @@ instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{ ins_pipe( pipe_slow ); %} -instruct evgather(vec dst, memory mem, vec idx, rRegP tmp) %{ +instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{ predicate(vector_length_in_bytes(n) == 64); match(Set dst (LoadVectorGather mem idx)); - effect(TEMP dst, TEMP tmp); + effect(TEMP dst, TEMP tmp, TEMP ktmp); format %{ "load_vector_gather $dst, $mem, $idx\t! 
using $tmp and $ktmp as TEMP" %} ins_encode %{ assert(UseAVX > 2, "sanity"); @@ -3642,10 +3722,9 @@ instruct evgather(vec dst, memory mem, vec idx, rRegP tmp) %{ assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE - KRegister ktmp = k2; - __ kmovwl(k2, ExternalAddress(vector_all_bits_set()), $tmp$$Register); + __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register); __ lea($tmp$$Register, $mem$$Address); - __ evgather(elem_bt, $dst$$XMMRegister, ktmp, $tmp$$Register, $idx$$XMMRegister, vlen_enc); + __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -3654,23 +3733,21 @@ instruct evgather(vec dst, memory mem, vec idx, rRegP tmp) %{ // Scatter INT, LONG, FLOAT, DOUBLE -instruct scatter(memory mem, vec src, vec idx, rRegP tmp) %{ +instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{ + predicate(UseAVX > 2); match(Set mem (StoreVectorScatter mem (Binary src idx))); - effect(TEMP tmp); + effect(TEMP tmp, TEMP ktmp); format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %} ins_encode %{ - assert(UseAVX > 2, "sanity"); - int vlen_enc = vector_length_encoding(this, $src); BasicType elem_bt = vector_element_basic_type(this, $src); assert(vector_length_in_bytes(this, $src) >= 16, "sanity"); assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE - KRegister ktmp = k2; - __ kmovwl(k2, ExternalAddress(vector_all_bits_set()), $tmp$$Register); + __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register); __ lea($tmp$$Register, $mem$$Address); - __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, ktmp, $src$$XMMRegister, vlen_enc); + __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -5790,12 +5867,12 @@ instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, l ins_pipe( pipe_slow ); %} -instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp) %{ +instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{ predicate(vector_length_in_bytes(n) == 64 && is_floating_point_type(vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE match(Set dst (MinV a b)); match(Set dst (MaxV a b)); - effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp); + effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp); format %{ "vector_minmaxFP $dst,$a,$b\t!using $atmp, $btmp as TEMP" %} ins_encode %{ assert(UseAVX > 2, "required"); @@ -5804,10 +5881,9 @@ instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp) %{ int vlen_enc = vector_length_encoding(this); BasicType elem_bt = vector_element_basic_type(this); - KRegister ktmp = k1; __ evminmax_fp(opcode, elem_bt, $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, - ktmp, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc); + $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -6832,23 +6908,22 @@ instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{ ins_pipe( pipe_slow ); %} -instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{ +instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{ predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1 is_floating_point_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE match(Set dst
(VectorMaskCmp (Binary src1 src2) cond)); - effect(TEMP scratch); + effect(TEMP scratch, TEMP ktmp); format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %} ins_encode %{ int vlen_enc = Assembler::AVX_512bit; Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant); - KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation. KRegister mask = k0; // The comparison itself is not being masked. if (vector_element_basic_type(this, $src1) == T_FLOAT) { - __ evcmpps(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); - __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register); + __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register); } else { - __ evcmppd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); - __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register); + __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register); } %} ins_pipe( pipe_slow ); @@ -6870,41 +6945,40 @@ instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) % ins_pipe( pipe_slow ); %} -instruct evcmp(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch) %{ +instruct evcmp(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{ predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1 is_integral_type(vector_element_basic_type(n->in(1)->in(1)))); // src1 match(Set dst (VectorMaskCmp (Binary src1 src2) cond)); - effect(TEMP scratch); + effect(TEMP scratch, TEMP ktmp); format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %} ins_encode %{ assert(UseAVX > 2, "required"); int vlen_enc = Assembler::AVX_512bit; Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant); - KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation. KRegister mask = k0; // The comparison itself is not being masked. 
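+ // With k0 in the mask position, the evpcmp* below writes every lane of
+ // $ktmp; the masked evmovdq* then expands $ktmp into all-ones/all-zeroes
+ // lanes loaded from vector_all_bits_set().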
bool merge = false; BasicType src1_elem_bt = vector_element_basic_type(this, $src1); switch (src1_elem_bt) { case T_BYTE: { - __ evpcmpb(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); - __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); + __ evpcmpb($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdqub($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); break; } case T_SHORT: { - __ evpcmpw(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); - __ evmovdquw($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); + __ evpcmpw($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdquw($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); break; } case T_INT: { - __ evpcmpd(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); - __ evmovdqul($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); + __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); break; } case T_LONG: { - __ evpcmpq(ktmp, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); - __ evmovdquq($dst$$XMMRegister, ktmp, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); + __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc); + __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register); break; } @@ -7082,17 +7156,16 @@ instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{ ins_pipe( pipe_slow ); %} -instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch) %{ +instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{ predicate(vector_length_in_bytes(n) == 64); match(Set dst (VectorBlend (Binary src1 src2) mask)); - format %{ "vector_blend $dst,$src1,$src2,$mask\t! using $scratch and k2 as TEMP" %} + format %{ "vector_blend $dst,$src1,$src2,$mask\t! using $scratch and $ktmp as TEMP" %} - effect(TEMP scratch); + effect(TEMP scratch, TEMP ktmp); ins_encode %{ int vlen_enc = Assembler::AVX_512bit; BasicType elem_bt = vector_element_basic_type(this); - KRegister ktmp = k2; - __ evpcmp(elem_bt, ktmp, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register); - __ evpblend(elem_bt, $dst$$XMMRegister, ktmp, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc); + __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register); + __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc); %} ins_pipe( pipe_slow ); %} @@ -7235,13 +7308,29 @@ instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{ predicate(vector_length_in_bytes(n->in(1)) >= 16 && + vector_length_in_bytes(n->in(1)) < 64 && static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow); match(Set dst (VectorTest src1 src2 )); effect(KILL cr); format %{ "vector_test $dst,$src1, $src2\t! 
using $cr as TEMP" %} ins_encode %{ int vlen = vector_length_in_bytes(this, $src1); - __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister); + __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg); + __ setb(Assembler::carrySet, $dst$$Register); + __ movzbl($dst$$Register, $dst$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vptest_alltrue_evex(rRegI dst, legVec src1, legVec src2, kReg ktmp, rFlagsReg cr) %{ + predicate(vector_length_in_bytes(n->in(1)) == 64 && + static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow); + match(Set dst (VectorTest src1 src2 )); + effect(KILL cr, TEMP ktmp); + format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %} + ins_encode %{ + int vlen = vector_length_in_bytes(this, $src1); + __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister); __ setb(Assembler::carrySet, $dst$$Register); __ movzbl($dst$$Register, $dst$$Register); %} @@ -7266,13 +7355,29 @@ instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, r instruct vptest_anytrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{ predicate(vector_length_in_bytes(n->in(1)) >= 16 && + vector_length_in_bytes(n->in(1)) < 64 && static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne); match(Set dst (VectorTest src1 src2 )); effect(KILL cr); format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %} ins_encode %{ int vlen = vector_length_in_bytes(this, $src1); - __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister); + __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg); + __ setb(Assembler::notZero, $dst$$Register); + __ movzbl($dst$$Register, $dst$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vptest_anytrue_evex(rRegI dst, legVec src1, legVec src2, kReg ktmp, rFlagsReg cr) %{ + predicate(vector_length_in_bytes(n->in(1)) == 64 && + static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne); + match(Set dst (VectorTest src1 src2 )); + effect(KILL cr, TEMP ktmp); + format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %} + ins_encode %{ + int vlen = vector_length_in_bytes(this, $src1); + __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister); __ setb(Assembler::notZero, $dst$$Register); __ movzbl($dst$$Register, $dst$$Register); %} @@ -7295,12 +7400,26 @@ instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 z instruct cmpvptest_anytrue(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{ predicate(vector_length_in_bytes(n->in(1)->in(1)) >= 16 && + vector_length_in_bytes(n->in(1)->in(1)) < 64 && static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne); match(Set cr (CmpI (VectorTest src1 src2) zero)); format %{ "cmp_vector_test_any_true $src1,$src2\t!" %} ins_encode %{ int vlen = vector_length_in_bytes(this, $src1); - __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister); + __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct cmpvptest_anytrue_evex(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, kReg ktmp) %{ + predicate(vector_length_in_bytes(n->in(1)->in(1)) == 64 && + static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne); + match(Set cr (CmpI (VectorTest src1 src2) zero)); + effect(TEMP ktmp); + format %{ "cmp_vector_test_any_true $src1,$src2\t!" 
%} + ins_encode %{ + int vlen = vector_length_in_bytes(this, $src1); + __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -7887,3 +8006,50 @@ instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{ %} ins_pipe( pipe_slow ); %} + +#ifdef _LP64 +// ---------------------------------- Masked Block Copy ------------------------------------ +instruct vmasked_load64(vec dst, memory mem, kReg mask) %{ + match(Set dst (LoadVectorMasked mem mask)); + format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %} + ins_encode %{ + BasicType elmType = this->bottom_type()->is_vect()->element_basic_type(); + int vector_len = vector_length_encoding(this); + __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{ + match(Set dst (VectorMaskGen len)); + effect(TEMP temp); + format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %} + ins_encode %{ + __ genmask($dst$$KRegister, $len$$Register, $temp$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{ + match(Set dst (VectorMaskGen len)); + format %{ "vector_mask_gen $len \t! vector mask generator" %} + effect(TEMP temp); + ins_encode %{ + __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant))); + __ kmovql($dst$$KRegister, $temp$$Register); + %} + ins_pipe( pipe_slow ); +%} + +instruct vmasked_store64(memory mem, vec src, kReg mask) %{ + match(Set mem (StoreVectorMasked mem (Binary src mask))); + format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %} + ins_encode %{ + const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src))); + BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type(); + int vector_len = vector_length_encoding(src_node); + __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len); + %} + ins_pipe( pipe_slow ); +%} +#endif // _LP64 diff --git a/src/hotspot/cpu/x86/x86_32.ad b/src/hotspot/cpu/x86/x86_32.ad index 1a1275b0de9..916a061d272 100644 --- a/src/hotspot/cpu/x86/x86_32.ad +++ b/src/hotspot/cpu/x86/x86_32.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
// // This code is free software; you can redistribute it and/or modify it @@ -739,7 +739,7 @@ int MachEpilogNode::safepoint_offset() const { return 0; } //============================================================================= -enum RC { rc_bad, rc_int, rc_float, rc_xmm, rc_stack }; +enum RC { rc_bad, rc_int, rc_kreg, rc_float, rc_xmm, rc_stack }; static enum RC rc_class( OptoReg::Name reg ) { if( !OptoReg::is_valid(reg) ) return rc_bad; @@ -1087,7 +1087,7 @@ uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bo if( src_first == dst_first && src_second == dst_second ) return size; // Self copy, no move - if (bottom_type()->isa_vect() != NULL) { + if (bottom_type()->isa_vect() != NULL && bottom_type()->isa_vectmask() == NULL) { uint ireg = ideal_reg(); assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity"); assert((src_first_rc != rc_float && dst_first_rc != rc_float), "sanity"); @@ -1139,7 +1139,7 @@ uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bo size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first,0x89,"MOV ",size, st); // Check for integer load - if( dst_first_rc == rc_int && src_first_rc == rc_stack ) + if( src_first_rc == rc_stack && dst_first_rc == rc_int ) size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first,0x8B,"MOV ",size, st); // Check for integer reg-xmm reg copy @@ -1228,16 +1228,16 @@ uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bo // Check for xmm store if( src_first_rc == rc_xmm && dst_first_rc == rc_stack ) { - return impl_x_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first, src_second, size, st); + return impl_x_helper(cbuf,do_size,false,ra_->reg2offset(dst_first), src_first, src_second, size, st); } // Check for float xmm load - if( dst_first_rc == rc_xmm && src_first_rc == rc_stack ) { + if( src_first_rc == rc_stack && dst_first_rc == rc_xmm ) { return impl_x_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first, dst_second, size, st); } // Copy from float reg to xmm reg - if( dst_first_rc == rc_xmm && src_first_rc == rc_float ) { + if( src_first_rc == rc_float && dst_first_rc == rc_xmm ) { // copy to the top of stack from floating point reg // and use LEA to preserve flags if( cbuf ) { @@ -1293,6 +1293,42 @@ uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bo if( dst_second_rc == rc_int && src_second_rc == rc_stack ) return impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),dst_second,0x8B,"MOV ",size, st); + // AVX-512 opmask specific spilling. 
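The spill branches that follow lean on one invariant: an opmask value occupies two consecutive allocator slots, so each branch first asserts "(first & 1) == 0 && first + 1 == second" before emitting a single kmov. A minimal standalone sketch of that pair check, with a hypothetical helper name (not HotSpot code), before the patch resumes:

#include <cassert>

// Model of the register-pair invariant asserted by the kreg spill code:
// the first half sits at an even index and the second half follows it.
static bool is_aligned_pair(int first, int second) {
  return (first & 1) == 0 && first + 1 == second;
}

int main() {
  assert(is_aligned_pair(0, 1));   // a valid opmask pair
  assert(is_aligned_pair(6, 7));   // another valid pair
  assert(!is_aligned_pair(1, 2));  // misaligned pairs are rejected
  return 0;
}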
+ if (src_first_rc == rc_stack && dst_first_rc == rc_kreg) { + assert((src_first & 1) == 0 && src_first + 1 == src_second, "invalid register pair"); + assert((dst_first & 1) == 0 && dst_first + 1 == dst_second, "invalid register pair"); + MacroAssembler _masm(cbuf); + int offset = ra_->reg2offset(src_first); + __ kmov(as_KRegister(Matcher::_regEncode[dst_first]), Address(rsp, offset)); + return 0; + } + + if (src_first_rc == rc_kreg && dst_first_rc == rc_stack) { + assert((src_first & 1) == 0 && src_first + 1 == src_second, "invalid register pair"); + assert((dst_first & 1) == 0 && dst_first + 1 == dst_second, "invalid register pair"); + MacroAssembler _masm(cbuf); + int offset = ra_->reg2offset(dst_first); + __ kmov(Address(rsp, offset), as_KRegister(Matcher::_regEncode[src_first])); + return 0; + } + + if (src_first_rc == rc_kreg && dst_first_rc == rc_int) { + Unimplemented(); + return 0; + } + + if (src_first_rc == rc_int && dst_first_rc == rc_kreg) { + Unimplemented(); + return 0; + } + + if (src_first_rc == rc_kreg && dst_first_rc == rc_kreg) { + assert((src_first & 1) == 0 && src_first + 1 == src_second, "invalid register pair"); + assert((dst_first & 1) == 0 && dst_first + 1 == dst_second, "invalid register pair"); + MacroAssembler _masm(cbuf); + __ kmov(as_KRegister(Matcher::_regEncode[dst_first]), as_KRegister(Matcher::_regEncode[src_first])); + return 0; + } Unimplemented(); return 0; // Mute compiler @@ -3716,6 +3752,72 @@ operand immI_65535() %{ interface(CONST_INTER); %} +operand kReg() +%{ + constraint(ALLOC_IN_RC(vectmask_reg)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +operand kReg_K1() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K1)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +operand kReg_K2() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K2)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +// Special Registers +operand kReg_K3() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K3)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +operand kReg_K4() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K4)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +operand kReg_K5() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K5)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +operand kReg_K6() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K6)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +// Special Registers +operand kReg_K7() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K7)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + // Register Operands // Integer Register operand rRegI() %{ @@ -11631,8 +11733,10 @@ instruct MoveL2D_reg_reg_sse(regD dst, eRegL src, regD tmp) %{ // ======================================================================= // fast clearing of an array +// Small ClearArray non-AVX512. 
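The non-AVX512 small form comes first below; the _evex ClearArray variants that follow it additionally reserve a kReg because the AVX-512 path of clear_mem finishes the residual tail with a masked store instead of a scalar loop. A scalar model of such a masked store, as a plain C++ sketch (helper name hypothetical, not the HotSpot API):

#include <cstdint>
#include <cstring>

// Scalar model of an EVEX masked store: lane i is written only when
// bit i of the mask is set; unselected lanes are left untouched.
static void masked_store64(uint64_t* dst, const uint64_t* src,
                           uint16_t mask, int lanes) {
  for (int i = 0; i < lanes; i++) {
    if (mask & (uint16_t(1) << i)) {
      dst[i] = src[i];
    }
  }
}

int main() {
  uint64_t zeros[8] = {0};
  uint64_t buf[8];
  memset(buf, 0xFF, sizeof(buf));
  // Clear only the first 5 of 8 lanes, like a 5-quadword tail.
  masked_store64(buf, zeros, (1u << 5) - 1, 8);
  return (buf[4] == 0 && buf[5] != 0) ? 0 : 1;
}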
instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ - predicate(!((ClearArrayNode*)n)->is_large()); + predicate(!((ClearArrayNode*)n)->is_large() && + (UseAVX <= 2 || !VM_Version::supports_avx512vlbw())); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr); @@ -11685,13 +11789,76 @@ instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe du %} ins_encode %{ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, - $tmp$$XMMRegister, false); + $tmp$$XMMRegister, false, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +// Small ClearArray AVX512 non-constant length. +instruct rep_stos_evex(eCXRegI cnt, eDIRegP base, regD tmp, kReg ktmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ + predicate(!((ClearArrayNode*)n)->is_large() && + UseAVX > 2 && VM_Version::supports_avx512vlbw() && + !n->in(2)->bottom_type()->is_int()->is_con()); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr); + + format %{ $$template + $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t" + $$emit$$"CMP InitArrayShortSize,rcx\n\t" + $$emit$$"JG LARGE\n\t" + $$emit$$"SHL ECX, 1\n\t" + $$emit$$"DEC ECX\n\t" + $$emit$$"JS DONE\t# Zero length\n\t" + $$emit$$"MOV EAX,(EDI,ECX,4)\t# LOOP\n\t" + $$emit$$"DEC ECX\n\t" + $$emit$$"JGE LOOP\n\t" + $$emit$$"JMP DONE\n\t" + $$emit$$"# LARGE:\n\t" + if (UseFastStosb) { + $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t" + $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t" + } else if (UseXMMForObjInit) { + $$emit$$"MOV RDI,RAX\n\t" + $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t" + $$emit$$"JMPQ L_zero_64_bytes\n\t" + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t" + $$emit$$"VMOVDQU YMM0,(RAX)\n\t" + $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t" + $$emit$$"ADD 0x40,RAX\n\t" + $$emit$$"# L_zero_64_bytes:\n\t" + $$emit$$"SUB 0x8,RCX\n\t" + $$emit$$"JGE L_loop\n\t" + $$emit$$"ADD 0x4,RCX\n\t" + $$emit$$"JL L_tail\n\t" + $$emit$$"VMOVDQU YMM0,(RAX)\n\t" + $$emit$$"ADD 0x20,RAX\n\t" + $$emit$$"SUB 0x4,RCX\n\t" + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t" + $$emit$$"ADD 0x4,RCX\n\t" + $$emit$$"JLE L_end\n\t" + $$emit$$"DEC RCX\n\t" + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t" + $$emit$$"VMOVQ XMM0,(RAX)\n\t" + $$emit$$"ADD 0x8,RAX\n\t" + $$emit$$"DEC RCX\n\t" + $$emit$$"JGE L_sloop\n\t" + $$emit$$"# L_end:\n\t" + } else { + $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t" + $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t" + } + $$emit$$"# DONE" + %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, + $tmp$$XMMRegister, false, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} +// Large ClearArray non-AVX512. instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ - predicate(((ClearArrayNode*)n)->is_large()); + predicate(UseAVX <= 2 && ((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr); format %{ $$template @@ -11734,14 +11901,79 @@ instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Unive %} ins_encode %{ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, - $tmp$$XMMRegister, true); + $tmp$$XMMRegister, true, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +// Large ClearArray AVX512. 
+instruct rep_stos_large_evex(eCXRegI cnt, eDIRegP base, regD tmp, kReg ktmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ + predicate(UseAVX > 2 && ((ClearArrayNode*)n)->is_large()); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr); + format %{ $$template + if (UseFastStosb) { + $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t" + $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t" + $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t" + } else if (UseXMMForObjInit) { + $$emit$$"MOV RDI,RAX\t# ClearArray:\n\t" + $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t" + $$emit$$"JMPQ L_zero_64_bytes\n\t" + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t" + $$emit$$"VMOVDQU YMM0,(RAX)\n\t" + $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t" + $$emit$$"ADD 0x40,RAX\n\t" + $$emit$$"# L_zero_64_bytes:\n\t" + $$emit$$"SUB 0x8,RCX\n\t" + $$emit$$"JGE L_loop\n\t" + $$emit$$"ADD 0x4,RCX\n\t" + $$emit$$"JL L_tail\n\t" + $$emit$$"VMOVDQU YMM0,(RAX)\n\t" + $$emit$$"ADD 0x20,RAX\n\t" + $$emit$$"SUB 0x4,RCX\n\t" + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t" + $$emit$$"ADD 0x4,RCX\n\t" + $$emit$$"JLE L_end\n\t" + $$emit$$"DEC RCX\n\t" + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t" + $$emit$$"VMOVQ XMM0,(RAX)\n\t" + $$emit$$"ADD 0x8,RAX\n\t" + $$emit$$"DEC RCX\n\t" + $$emit$$"JGE L_sloop\n\t" + $$emit$$"# L_end:\n\t" + } else { + $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t" + $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t" + $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t" + } + $$emit$$"# DONE" + %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, + $tmp$$XMMRegister, true, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} +// Small ClearArray AVX512 constant length. +instruct rep_stos_im(immI cnt, kReg ktmp, eRegP base, regD tmp, rRegI zero, Universe dummy, eFlagsReg cr) +%{ + predicate(!((ClearArrayNode*)n)->is_large() && + (UseAVX > 2 && VM_Version::supports_avx512vlbw() && + n->in(2)->bottom_type()->is_int()->is_con())); + match(Set dummy (ClearArray cnt base)); + effect(TEMP tmp, TEMP zero, TEMP ktmp, KILL cr); + format %{ "clear_mem_imm $base , $cnt \n\t" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$constant, $zero$$Register, $tmp$$XMMRegister, $ktmp$$KRegister); + %} + ins_pipe(pipe_slow); +%} + instruct string_compareL(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2, eAXRegI result, regD tmp1, eFlagsReg cr) %{ - predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL); + predicate(UseAVX <= 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); @@ -11749,14 +11981,29 @@ instruct string_compareL(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2, ins_encode %{ __ string_compare($str1$$Register, $str2$$Register, $cnt1$$Register, $cnt2$$Register, $result$$Register, - $tmp1$$XMMRegister, StrIntrinsicNode::LL); + $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_compareL_evex(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2, + eAXRegI result, regD tmp1, kReg ktmp, eFlagsReg cr) %{ + predicate(UseAVX > 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP ktmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); + + format %{ "String Compare byte[] 
$str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %} + ins_encode %{ + __ string_compare($str1$$Register, $str2$$Register, + $cnt1$$Register, $cnt2$$Register, $result$$Register, + $tmp1$$XMMRegister, StrIntrinsicNode::LL, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} instruct string_compareU(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2, eAXRegI result, regD tmp1, eFlagsReg cr) %{ - predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU); + predicate(UseAVX <= 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); @@ -11764,14 +12011,29 @@ instruct string_compareU(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2, ins_encode %{ __ string_compare($str1$$Register, $str2$$Register, $cnt1$$Register, $cnt2$$Register, $result$$Register, - $tmp1$$XMMRegister, StrIntrinsicNode::UU); + $tmp1$$XMMRegister, StrIntrinsicNode::UU, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_compareU_evex(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2, + eAXRegI result, regD tmp1, kReg ktmp, eFlagsReg cr) %{ + predicate(UseAVX > 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP ktmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); + + format %{ "String Compare char[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %} + ins_encode %{ + __ string_compare($str1$$Register, $str2$$Register, + $cnt1$$Register, $cnt2$$Register, $result$$Register, + $tmp1$$XMMRegister, StrIntrinsicNode::UU, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} instruct string_compareLU(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2, eAXRegI result, regD tmp1, eFlagsReg cr) %{ - predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU); + predicate(UseAVX <= 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU); match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); @@ -11779,14 +12041,29 @@ instruct string_compareLU(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2 ins_encode %{ __ string_compare($str1$$Register, $str2$$Register, $cnt1$$Register, $cnt2$$Register, $result$$Register, - $tmp1$$XMMRegister, StrIntrinsicNode::LU); + $tmp1$$XMMRegister, StrIntrinsicNode::LU, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_compareLU_evex(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2, + eAXRegI result, regD tmp1, kReg ktmp, eFlagsReg cr) %{ + predicate(UseAVX > 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP ktmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); + + format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %} + ins_encode %{ + __ string_compare($str1$$Register, $str2$$Register, + $cnt1$$Register, $cnt2$$Register, $result$$Register, + $tmp1$$XMMRegister, StrIntrinsicNode::LU, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} instruct string_compareUL(eSIRegP str1, eDXRegI cnt1, eDIRegP str2, eCXRegI cnt2, eAXRegI result, regD tmp1, eFlagsReg cr) %{ - predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL); + predicate(UseAVX <= 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL); match(Set result 
(StrComp (Binary str1 cnt1) (Binary str2 cnt2))); effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); @@ -11794,7 +12071,22 @@ instruct string_compareUL(eSIRegP str1, eDXRegI cnt1, eDIRegP str2, eCXRegI cnt2 ins_encode %{ __ string_compare($str2$$Register, $str1$$Register, $cnt2$$Register, $cnt1$$Register, $result$$Register, - $tmp1$$XMMRegister, StrIntrinsicNode::UL); + $tmp1$$XMMRegister, StrIntrinsicNode::UL, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_compareUL_evex(eSIRegP str1, eDXRegI cnt1, eDIRegP str2, eCXRegI cnt2, + eAXRegI result, regD tmp1, kReg ktmp, eFlagsReg cr) %{ + predicate(UseAVX > 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP ktmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); + + format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %} + ins_encode %{ + __ string_compare($str2$$Register, $str1$$Register, + $cnt2$$Register, $cnt1$$Register, $result$$Register, + $tmp1$$XMMRegister, StrIntrinsicNode::UL, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -11802,6 +12094,7 @@ instruct string_compareUL(eSIRegP str1, eDXRegI cnt1, eDIRegP str2, eCXRegI cnt2 // fast string equals instruct string_equals(eDIRegP str1, eSIRegP str2, eCXRegI cnt, eAXRegI result, regD tmp1, regD tmp2, eBXRegI tmp3, eFlagsReg cr) %{ + predicate(UseAVX <= 2); match(Set result (StrEquals (Binary str1 str2) cnt)); effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL tmp3, KILL cr); @@ -11809,12 +12102,29 @@ instruct string_equals(eDIRegP str1, eSIRegP str2, eCXRegI cnt, eAXRegI result, ins_encode %{ __ arrays_equals(false, $str1$$Register, $str2$$Register, $cnt$$Register, $result$$Register, $tmp3$$Register, - $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */); + $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */, knoreg); + %} + + ins_pipe( pipe_slow ); +%} + +instruct string_equals_evex(eDIRegP str1, eSIRegP str2, eCXRegI cnt, eAXRegI result, + regD tmp1, regD tmp2, kReg ktmp, eBXRegI tmp3, eFlagsReg cr) %{ + predicate(UseAVX > 2); + match(Set result (StrEquals (Binary str1 str2) cnt)); + effect(TEMP tmp1, TEMP tmp2, TEMP ktmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL tmp3, KILL cr); + + format %{ "String Equals $str1,$str2,$cnt -> $result // KILL $tmp1, $tmp2, $tmp3" %} + ins_encode %{ + __ arrays_equals(false, $str1$$Register, $str2$$Register, + $cnt$$Register, $result$$Register, $tmp3$$Register, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} + // fast search of substring with known size. 
instruct string_indexof_conL(eDIRegP str1, eDXRegI cnt1, eSIRegP str2, immI int_cnt2, eBXRegI result, regD vec1, eAXRegI cnt2, eCXRegI tmp, eFlagsReg cr) %{ @@ -11964,7 +12274,7 @@ instruct string_indexofU_char(eDIRegP str1, eDXRegI cnt1, eAXRegI ch, instruct array_equalsB(eDIRegP ary1, eSIRegP ary2, eAXRegI result, regD tmp1, regD tmp2, eCXRegI tmp3, eBXRegI tmp4, eFlagsReg cr) %{ - predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); + predicate(UseAVX <= 2 && ((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (AryEq ary1 ary2)); effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr); //ins_cost(300); @@ -11973,7 +12283,24 @@ instruct array_equalsB(eDIRegP ary1, eSIRegP ary2, eAXRegI result, ins_encode %{ __ arrays_equals(true, $ary1$$Register, $ary2$$Register, $tmp3$$Register, $result$$Register, $tmp4$$Register, - $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */); + $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct array_equalsB_evex(eDIRegP ary1, eSIRegP ary2, eAXRegI result, + regD tmp1, regD tmp2, kReg ktmp, eCXRegI tmp3, eBXRegI tmp4, eFlagsReg cr) +%{ + predicate(UseAVX > 2 && ((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); + match(Set result (AryEq ary1 ary2)); + effect(TEMP tmp1, TEMP tmp2, TEMP ktmp, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr); + //ins_cost(300); + + format %{ "Array Equals byte[] $ary1,$ary2 -> $result // KILL $tmp1, $tmp2, $tmp3, $tmp4" %} + ins_encode %{ + __ arrays_equals(true, $ary1$$Register, $ary2$$Register, + $tmp3$$Register, $result$$Register, $tmp4$$Register, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -11981,7 +12308,7 @@ instruct array_equalsB(eDIRegP ary1, eSIRegP ary2, eAXRegI result, instruct array_equalsC(eDIRegP ary1, eSIRegP ary2, eAXRegI result, regD tmp1, regD tmp2, eCXRegI tmp3, eBXRegI tmp4, eFlagsReg cr) %{ - predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); + predicate(UseAVX <= 2 && ((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (AryEq ary1 ary2)); effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr); //ins_cost(300); @@ -11990,7 +12317,24 @@ instruct array_equalsC(eDIRegP ary1, eSIRegP ary2, eAXRegI result, ins_encode %{ __ arrays_equals(true, $ary1$$Register, $ary2$$Register, $tmp3$$Register, $result$$Register, $tmp4$$Register, - $tmp1$$XMMRegister, $tmp2$$XMMRegister, true /* char */); + $tmp1$$XMMRegister, $tmp2$$XMMRegister, true /* char */, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct array_equalsC_evex(eDIRegP ary1, eSIRegP ary2, eAXRegI result, + regD tmp1, regD tmp2, kReg ktmp, eCXRegI tmp3, eBXRegI tmp4, eFlagsReg cr) +%{ + predicate(UseAVX > 2 && ((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); + match(Set result (AryEq ary1 ary2)); + effect(TEMP tmp1, TEMP tmp2, TEMP ktmp, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr); + //ins_cost(300); + + format %{ "Array Equals char[] $ary1,$ary2 -> $result // KILL $tmp1, $tmp2, $tmp3, $tmp4" %} + ins_encode %{ + __ arrays_equals(true, $ary1$$Register, $ary2$$Register, + $tmp3$$Register, $result$$Register, $tmp4$$Register, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, true /* char */, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -11998,6 +12342,7 @@ instruct array_equalsC(eDIRegP ary1, eSIRegP ary2, eAXRegI result, instruct has_negatives(eSIRegP ary1, eCXRegI 
len, eAXRegI result, regD tmp1, regD tmp2, eBXRegI tmp3, eFlagsReg cr) %{ + predicate(UseAVX <= 2); match(Set result (HasNegatives ary1 len)); effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL len, KILL tmp3, KILL cr); @@ -12005,14 +12350,32 @@ instruct has_negatives(eSIRegP ary1, eCXRegI len, eAXRegI result, ins_encode %{ __ has_negatives($ary1$$Register, $len$$Register, $result$$Register, $tmp3$$Register, - $tmp1$$XMMRegister, $tmp2$$XMMRegister); + $tmp1$$XMMRegister, $tmp2$$XMMRegister, knoreg, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct has_negatives_evex(eSIRegP ary1, eCXRegI len, eAXRegI result, + regD tmp1, regD tmp2, kReg ktmp1, kReg ktmp2, eBXRegI tmp3, eFlagsReg cr) +%{ + predicate(UseAVX > 2); + match(Set result (HasNegatives ary1 len)); + effect(TEMP tmp1, TEMP tmp2, TEMP ktmp1, TEMP ktmp2, USE_KILL ary1, USE_KILL len, KILL tmp3, KILL cr); + + format %{ "has negatives byte[] $ary1,$len -> $result // KILL $tmp1, $tmp2, $tmp3" %} + ins_encode %{ + __ has_negatives($ary1$$Register, $len$$Register, + $result$$Register, $tmp3$$Register, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister); %} ins_pipe( pipe_slow ); %} + // fast char[] to byte[] compression -instruct string_compress(eSIRegP src, eDIRegP dst, eDXRegI len, regD tmp1, regD tmp2, regD tmp3, regD tmp4, - eCXRegI tmp5, eAXRegI result, eFlagsReg cr) %{ +instruct string_compress(eSIRegP src, eDIRegP dst, eDXRegI len, regD tmp1, regD tmp2, + regD tmp3, regD tmp4, eCXRegI tmp5, eAXRegI result, eFlagsReg cr) %{ + predicate(UseAVX <= 2); match(Set result (StrCompressedCopy src (Binary dst len))); effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL tmp5, KILL cr); @@ -12020,7 +12383,24 @@ instruct string_compress(eSIRegP src, eDIRegP dst, eDXRegI len, regD tmp1, regD ins_encode %{ __ char_array_compress($src$$Register, $dst$$Register, $len$$Register, $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, - $tmp4$$XMMRegister, $tmp5$$Register, $result$$Register); + $tmp4$$XMMRegister, $tmp5$$Register, $result$$Register, + knoreg, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_compress_evex(eSIRegP src, eDIRegP dst, eDXRegI len, regD tmp1, regD tmp2, + regD tmp3, regD tmp4, kReg ktmp1, kReg ktmp2, eCXRegI tmp5, eAXRegI result, eFlagsReg cr) %{ + predicate(UseAVX > 2); + match(Set result (StrCompressedCopy src (Binary dst len))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP ktmp1, TEMP ktmp2, USE_KILL src, USE_KILL dst, USE_KILL len, KILL tmp5, KILL cr); + + format %{ "String Compress $src,$dst -> $result // KILL RAX, RCX, RDX" %} + ins_encode %{ + __ char_array_compress($src$$Register, $dst$$Register, $len$$Register, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, + $tmp4$$XMMRegister, $tmp5$$Register, $result$$Register, + $ktmp1$$KRegister, $ktmp2$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -12028,13 +12408,28 @@ instruct string_compress(eSIRegP src, eDIRegP dst, eDXRegI len, regD tmp1, regD // fast byte[] to char[] inflation instruct string_inflate(Universe dummy, eSIRegP src, eDIRegP dst, eDXRegI len, regD tmp1, eCXRegI tmp2, eFlagsReg cr) %{ + predicate(UseAVX <= 2); match(Set dummy (StrInflatedCopy src (Binary dst len))); effect(TEMP tmp1, TEMP tmp2, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); format %{ "String Inflate $src,$dst // KILL $tmp1, $tmp2" %} ins_encode %{ __ byte_array_inflate($src$$Register, $dst$$Register, $len$$Register, - $tmp1$$XMMRegister, $tmp2$$Register); + 
$tmp1$$XMMRegister, $tmp2$$Register, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_inflate_evex(Universe dummy, eSIRegP src, eDIRegP dst, eDXRegI len, + regD tmp1, kReg ktmp, eCXRegI tmp2, eFlagsReg cr) %{ + predicate(UseAVX > 2); + match(Set dummy (StrInflatedCopy src (Binary dst len))); + effect(TEMP tmp1, TEMP tmp2, TEMP ktmp, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); + + format %{ "String Inflate $src,$dst // KILL $tmp1, $tmp2" %} + ins_encode %{ + __ byte_array_inflate($src$$Register, $dst$$Register, $len$$Register, + $tmp1$$XMMRegister, $tmp2$$Register, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -12462,10 +12857,12 @@ instruct jmpLoopEndUCF(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{ // mask version // Jump Direct Conditional - Label defines a relative address from Jcc+1 -instruct jmpLoopEnd_and_restoreMask(cmpOp cop, eFlagsReg cr, label labl) %{ - predicate(n->has_vector_mask_set()); +// Bounded mask operand used in following pattern is needed for +// post-loop multiversioning. +instruct jmpLoopEnd_and_restoreMask(cmpOp cop, kReg_K1 ktmp, eFlagsReg cr, label labl) %{ + predicate(PostLoopMultiversioning && n->has_vector_mask_set()); match(CountedLoopEnd cop cr); - effect(USE labl); + effect(USE labl, TEMP ktmp); ins_cost(400); format %{ "J$cop $labl\t# Loop end\n\t" @@ -12474,16 +12871,18 @@ instruct jmpLoopEnd_and_restoreMask(cmpOp cop, eFlagsReg cr, label labl) %{ ins_encode %{ Label* L = $labl$$label; __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump - __ restorevectmask(); + __ restorevectmask($ktmp$$KRegister); %} ins_pipe( pipe_jcc ); %} // Jump Direct Conditional - Label defines a relative address from Jcc+1 -instruct jmpLoopEndU_and_restoreMask(cmpOpU cop, eFlagsRegU cmp, label labl) %{ - predicate(n->has_vector_mask_set()); +// Bounded mask operand used in following pattern is needed for +// post-loop multiversioning. +instruct jmpLoopEndU_and_restoreMask(cmpOpU cop, kReg_K1 ktmp, eFlagsRegU cmp, label labl) %{ + predicate(PostLoopMultiversioning && n->has_vector_mask_set()); match(CountedLoopEnd cop cmp); - effect(USE labl); + effect(USE labl, TEMP ktmp); ins_cost(400); format %{ "J$cop,u $labl\t# Loop end\n\t" @@ -12492,15 +12891,17 @@ instruct jmpLoopEndU_and_restoreMask(cmpOpU cop, eFlagsRegU cmp, label labl) %{ ins_encode %{ Label* L = $labl$$label; __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump - __ restorevectmask(); + __ restorevectmask($ktmp$$KRegister); %} ins_pipe( pipe_jcc ); %} -instruct jmpLoopEndUCF_and_restoreMask(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{ - predicate(n->has_vector_mask_set()); +// Bounded mask operand used in following pattern is needed for +// post-loop multiversioning. 
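All three of these loop-end forms do the same easily-missed job: when a multiversioned post loop runs its final iterations under a partial mask, every exit edge must put the mask register back to "all lanes enabled" before straight-line code resumes, which is what restorevectmask() does through the bounded kReg_K1 operand. A standalone sketch of the restore value, assuming a 16-lane mask width (helper name and constant are illustrative, not the HotSpot API); the final UCF form of the patch follows:

#include <cassert>
#include <cstdint>

// Illustrative only: the all-lanes-on value a masked post loop must
// restore on exit, for an assumed 16-lane vector shape.
static uint16_t all_lanes_enabled(unsigned lanes) {
  assert(lanes >= 1 && lanes <= 16);
  return lanes == 16 ? uint16_t(0xFFFF) : uint16_t((1u << lanes) - 1);
}

int main() {
  assert(all_lanes_enabled(16) == 0xFFFF);  // full-width restore value
  assert(all_lanes_enabled(4)  == 0x000F);  // partial shape for contrast
  return 0;
}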
+instruct jmpLoopEndUCF_and_restoreMask(cmpOpUCF cop, kReg_K1 ktmp, eFlagsRegUCF cmp, label labl) %{ + predicate(PostLoopMultiversioning && n->has_vector_mask_set()); match(CountedLoopEnd cop cmp); - effect(USE labl); + effect(USE labl, TEMP ktmp); ins_cost(300); format %{ "J$cop,u $labl\t# Loop end\n\t" @@ -12509,7 +12910,7 @@ instruct jmpLoopEndUCF_and_restoreMask(cmpOpUCF cop, eFlagsRegUCF cmp, label lab ins_encode %{ Label* L = $labl$$label; __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump - __ restorevectmask(); + __ restorevectmask($ktmp$$KRegister); %} ins_pipe( pipe_jcc ); %} diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad index 8fac804e19b..9b8951ba905 100644 --- a/src/hotspot/cpu/x86/x86_64.ad +++ b/src/hotspot/cpu/x86/x86_64.ad @@ -1149,6 +1149,7 @@ int MachEpilogNode::safepoint_offset() const enum RC { rc_bad, rc_int, + rc_kreg, rc_float, rc_stack }; @@ -1163,6 +1164,8 @@ static enum RC rc_class(OptoReg::Name reg) if (r->is_Register()) return rc_int; + if (r->is_KRegister()) return rc_kreg; + assert(r->is_XMMRegister(), "must be"); return rc_float; } @@ -1276,7 +1279,7 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf, // Self copy, no move return 0; } - if (bottom_type()->isa_vect() != NULL) { + if (bottom_type()->isa_vect() != NULL && bottom_type()->isa_vectmask() == NULL) { uint ireg = ideal_reg(); assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity"); assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ), "sanity"); @@ -1406,6 +1409,24 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf, st->print("movss %s, [rsp + #%d]\t# spill", Matcher::regName[dst_first], offset); +#endif + } + } + return 0; + } else if (dst_first_rc == rc_kreg) { + // mem -> kreg + if ((src_first & 1) == 0 && src_first + 1 == src_second && + (dst_first & 1) == 0 && dst_first + 1 == dst_second) { + // 64-bit + int offset = ra_->reg2offset(src_first); + if (cbuf) { + MacroAssembler _masm(cbuf); + __ kmov(as_KRegister(Matcher::_regEncode[dst_first]), Address(rsp, offset)); +#ifndef PRODUCT + } else { + st->print("kmovq %s, [rsp + #%d]\t# spill", + Matcher::regName[dst_first], + offset); #endif } } @@ -1511,6 +1532,23 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf, } } return 0; + } else if (dst_first_rc == rc_kreg) { + if ((src_first & 1) == 0 && src_first + 1 == src_second && + (dst_first & 1) == 0 && dst_first + 1 == dst_second) { + // 64-bit + if (cbuf) { + MacroAssembler _masm(cbuf); + __ kmov(as_KRegister(Matcher::_regEncode[dst_first]), as_Register(Matcher::_regEncode[src_first])); + #ifndef PRODUCT + } else { + st->print("kmovq %s, %s\t# spill", + Matcher::regName[dst_first], + Matcher::regName[src_first]); + #endif + } + } + Unimplemented(); + return 0; } } else if (src_first_rc == rc_float) { // xmm -> @@ -1611,6 +1649,65 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf, } } return 0; + } else if (dst_first_rc == rc_kreg) { + assert(false, "Illegal spilling"); + return 0; + } + } else if (src_first_rc == rc_kreg) { + if (dst_first_rc == rc_stack) { + // kreg -> mem + if ((src_first & 1) == 0 && src_first + 1 == src_second && + (dst_first & 1) == 0 && dst_first + 1 == dst_second) { + // 64-bit + int offset = ra_->reg2offset(dst_first); + if (cbuf) { + MacroAssembler _masm(cbuf); + __ kmov(Address(rsp, offset), as_KRegister(Matcher::_regEncode[src_first])); +#ifndef PRODUCT + } else { + st->print("kmovq [rsp + #%d] , %s\t# spill", + offset, 
Matcher::regName[src_first]); +#endif + } + } + return 0; + } else if (dst_first_rc == rc_int) { + if ((src_first & 1) == 0 && src_first + 1 == src_second && + (dst_first & 1) == 0 && dst_first + 1 == dst_second) { + // 64-bit + if (cbuf) { + MacroAssembler _masm(cbuf); + __ kmov(as_Register(Matcher::_regEncode[dst_first]), as_KRegister(Matcher::_regEncode[src_first])); +#ifndef PRODUCT + } else { + st->print("kmovq %s, %s\t# spill", + Matcher::regName[dst_first], + Matcher::regName[src_first]); +#endif + } + } + Unimplemented(); + return 0; + } else if (dst_first_rc == rc_kreg) { + if ((src_first & 1) == 0 && src_first + 1 == src_second && + (dst_first & 1) == 0 && dst_first + 1 == dst_second) { + // 64-bit + if (cbuf) { + MacroAssembler _masm(cbuf); + __ kmov(as_KRegister(Matcher::_regEncode[dst_first]), as_KRegister(Matcher::_regEncode[src_first])); +#ifndef PRODUCT + } else { + st->print("kmovq %s, %s\t# spill", + Matcher::regName[dst_first], + Matcher::regName[src_first]); +#endif + } + } + return 0; + } else if (dst_first_rc == rc_float) { + assert(false, "Illegal spill"); + return 0; } } @@ -2982,7 +3079,7 @@ frame RAX_H_num // Op_RegL }; // Excluded flags and vector registers. - assert(ARRAY_SIZE(hi) == _last_machine_leaf - 6, "missing type"); + assert(ARRAY_SIZE(hi) == _last_machine_leaf - 7, "missing type"); return OptoRegPair(hi[ideal_reg], lo[ideal_reg]); %} %} @@ -3428,6 +3525,72 @@ operand immL_65535() interface(CONST_INTER); %} +operand kReg() +%{ + constraint(ALLOC_IN_RC(vectmask_reg)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +operand kReg_K1() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K1)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +operand kReg_K2() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K2)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +// Special Registers +operand kReg_K3() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K3)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +operand kReg_K4() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K4)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +operand kReg_K5() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K5)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +operand kReg_K6() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K6)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + +// Special Registers +operand kReg_K7() +%{ + constraint(ALLOC_IN_RC(vectmask_reg_K7)); + match(RegVectMask); + format %{%} + interface(REG_INTER); +%} + // Register Operands // Integer Register operand rRegI() @@ -4901,7 +5064,6 @@ define // name must have been defined in an 'enc_class' specification // in the encode section of the architecture description. - //----------Load/Store/Move Instructions--------------------------------------- //----------Load Instructions-------------------------------------------------- @@ -11073,13 +11235,13 @@ instruct MoveL2D_reg_reg(regD dst, rRegL src) %{ ins_pipe( pipe_slow ); %} - -// ======================================================================= -// fast clearing of an array +// Fast clearing of an array +// Small ClearArray non-AVX512. 
instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero, Universe dummy, rFlagsReg cr) %{ - predicate(!((ClearArrayNode*)n)->is_large()); + predicate(!((ClearArrayNode*)n)->is_large() && + (UseAVX <= 2 || !VM_Version::supports_avx512vlbw())); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr); @@ -11130,15 +11292,78 @@ instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero, %} ins_encode %{ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, - $tmp$$XMMRegister, false); + $tmp$$XMMRegister, false, knoreg); %} ins_pipe(pipe_slow); %} -instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero, +// Small ClearArray AVX512 non-constant length. +instruct rep_stos_evex(rcx_RegL cnt, rdi_RegP base, regD tmp, kReg ktmp, rax_RegI zero, + Universe dummy, rFlagsReg cr) +%{ + predicate(!((ClearArrayNode*)n)->is_large() && + UseAVX > 2 && VM_Version::supports_avx512vlbw() && + !n->in(2)->bottom_type()->is_long()->is_con()); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr); + + format %{ $$template + $$emit$$"xorq rax, rax\t# ClearArray:\n\t" + $$emit$$"cmp InitArrayShortSize,rcx\n\t" + $$emit$$"jg LARGE\n\t" + $$emit$$"dec rcx\n\t" + $$emit$$"js DONE\t# Zero length\n\t" + $$emit$$"mov rax,(rdi,rcx,8)\t# LOOP\n\t" + $$emit$$"dec rcx\n\t" + $$emit$$"jge LOOP\n\t" + $$emit$$"jmp DONE\n\t" + $$emit$$"# LARGE:\n\t" + if (UseFastStosb) { + $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t" + $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t" + } else if (UseXMMForObjInit) { + $$emit$$"mov rdi,rax\n\t" + $$emit$$"vpxor ymm0,ymm0,ymm0\n\t" + $$emit$$"jmpq L_zero_64_bytes\n\t" + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t" + $$emit$$"vmovdqu ymm0,(rax)\n\t" + $$emit$$"vmovdqu ymm0,0x20(rax)\n\t" + $$emit$$"add 0x40,rax\n\t" + $$emit$$"# L_zero_64_bytes:\n\t" + $$emit$$"sub 0x8,rcx\n\t" + $$emit$$"jge L_loop\n\t" + $$emit$$"add 0x4,rcx\n\t" + $$emit$$"jl L_tail\n\t" + $$emit$$"vmovdqu ymm0,(rax)\n\t" + $$emit$$"add 0x20,rax\n\t" + $$emit$$"sub 0x4,rcx\n\t" + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t" + $$emit$$"add 0x4,rcx\n\t" + $$emit$$"jle L_end\n\t" + $$emit$$"dec rcx\n\t" + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t" + $$emit$$"vmovq xmm0,(rax)\n\t" + $$emit$$"add 0x8,rax\n\t" + $$emit$$"dec rcx\n\t" + $$emit$$"jge L_sloop\n\t" + $$emit$$"# L_end:\n\t" + } else { + $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t" + } + $$emit$$"# DONE" + %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, + $tmp$$XMMRegister, false, $ktmp$$KRegister); + %} + ins_pipe(pipe_slow); +%} + +// Large ClearArray non-AVX512. +instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero, Universe dummy, rFlagsReg cr) %{ - predicate(((ClearArrayNode*)n)->is_large()); + predicate(UseAVX <= 2 && ((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr); @@ -11179,8 +11404,74 @@ instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero, } %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, - $tmp$$XMMRegister, true); + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, + $tmp$$XMMRegister, true, knoreg); + %} + ins_pipe(pipe_slow); +%} + +// Large ClearArray AVX512. 
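Before the large and constant-length AVX-512 forms below: the rep_stos_im variant a bit further down materializes its opmask as ~0 >> (64 - len), a word with exactly len low bits set. A plain C++ check of that identity, including the edge cases the shift rules allow (sketch only; the constant comes straight from the patch):

#include <cassert>
#include <cstdint>

// The constant-length ClearArray form builds its k-mask as
// ~0 >> (64 - len). Well-defined for len in [1, 64]; len == 0 would
// shift by 64, which C++ leaves undefined, so it is excluded here.
static uint64_t low_bits(unsigned len) {
  assert(len >= 1 && len <= 64);
  return ~uint64_t(0) >> (64 - len);
}

int main() {
  assert(low_bits(1)  == 0x1);            // one lane selected
  assert(low_bits(8)  == 0xFF);           // eight low bits set
  assert(low_bits(64) == ~uint64_t(0));   // every bit set
  return 0;
}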
+instruct rep_stos_large_evex(rcx_RegL cnt, rdi_RegP base, regD tmp, kReg ktmp, rax_RegI zero, + Universe dummy, rFlagsReg cr) +%{ + predicate(UseAVX > 2 && ((ClearArrayNode*)n)->is_large()); + match(Set dummy (ClearArray cnt base)); + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr); + + format %{ $$template + if (UseFastStosb) { + $$emit$$"xorq rax, rax\t# ClearArray:\n\t" + $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t" + $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--" + } else if (UseXMMForObjInit) { + $$emit$$"mov rdi,rax\t# ClearArray:\n\t" + $$emit$$"vpxor ymm0,ymm0,ymm0\n\t" + $$emit$$"jmpq L_zero_64_bytes\n\t" + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t" + $$emit$$"vmovdqu ymm0,(rax)\n\t" + $$emit$$"vmovdqu ymm0,0x20(rax)\n\t" + $$emit$$"add 0x40,rax\n\t" + $$emit$$"# L_zero_64_bytes:\n\t" + $$emit$$"sub 0x8,rcx\n\t" + $$emit$$"jge L_loop\n\t" + $$emit$$"add 0x4,rcx\n\t" + $$emit$$"jl L_tail\n\t" + $$emit$$"vmovdqu ymm0,(rax)\n\t" + $$emit$$"add 0x20,rax\n\t" + $$emit$$"sub 0x4,rcx\n\t" + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t" + $$emit$$"add 0x4,rcx\n\t" + $$emit$$"jle L_end\n\t" + $$emit$$"dec rcx\n\t" + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t" + $$emit$$"vmovq xmm0,(rax)\n\t" + $$emit$$"add 0x8,rax\n\t" + $$emit$$"dec rcx\n\t" + $$emit$$"jge L_sloop\n\t" + $$emit$$"# L_end:\n\t" + } else { + $$emit$$"xorq rax, rax\t# ClearArray:\n\t" + $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--" + } + %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, + $tmp$$XMMRegister, true, $ktmp$$KRegister); + %} + ins_pipe(pipe_slow); +%} + +// Small ClearArray AVX512 constant length. +instruct rep_stos_im(immL cnt, rRegP base, regD tmp, rRegI zero, kReg ktmp, Universe dummy, rFlagsReg cr) +%{ + predicate(!((ClearArrayNode*)n)->is_large() && + (UseAVX > 2 && VM_Version::supports_avx512vlbw() && + n->in(2)->bottom_type()->is_long()->is_con())); + match(Set dummy (ClearArray cnt base)); + effect(TEMP tmp, TEMP zero, TEMP ktmp, KILL cr); + format %{ "clear_mem_imm $base , $cnt \n\t" %} + ins_encode %{ + __ clear_mem($base$$Register, $cnt$$constant, $zero$$Register, $tmp$$XMMRegister, $ktmp$$KRegister); %} ins_pipe(pipe_slow); %} @@ -11188,7 +11479,7 @@ instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero, instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2, rax_RegI result, legRegD tmp1, rFlagsReg cr) %{ - predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL); + predicate(UseAVX <= 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); @@ -11196,7 +11487,23 @@ instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI c ins_encode %{ __ string_compare($str1$$Register, $str2$$Register, $cnt1$$Register, $cnt2$$Register, $result$$Register, - $tmp1$$XMMRegister, StrIntrinsicNode::LL); + $tmp1$$XMMRegister, StrIntrinsicNode::LL, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_compareL_evex(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2, + rax_RegI result, legRegD tmp1, kReg ktmp, rFlagsReg cr) +%{ + predicate(UseAVX > 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP ktmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); + + 
format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %} + ins_encode %{ + __ string_compare($str1$$Register, $str2$$Register, + $cnt1$$Register, $cnt2$$Register, $result$$Register, + $tmp1$$XMMRegister, StrIntrinsicNode::LL, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -11204,7 +11511,7 @@ instruct string_compareL(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI c instruct string_compareU(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2, rax_RegI result, legRegD tmp1, rFlagsReg cr) %{ - predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU); + predicate(UseAVX <= 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); @@ -11212,7 +11519,23 @@ instruct string_compareU(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI c ins_encode %{ __ string_compare($str1$$Register, $str2$$Register, $cnt1$$Register, $cnt2$$Register, $result$$Register, - $tmp1$$XMMRegister, StrIntrinsicNode::UU); + $tmp1$$XMMRegister, StrIntrinsicNode::UU, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_compareU_evex(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2, + rax_RegI result, legRegD tmp1, kReg ktmp, rFlagsReg cr) +%{ + predicate(UseAVX > 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP ktmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); + + format %{ "String Compare char[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %} + ins_encode %{ + __ string_compare($str1$$Register, $str2$$Register, + $cnt1$$Register, $cnt2$$Register, $result$$Register, + $tmp1$$XMMRegister, StrIntrinsicNode::UU, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -11220,7 +11543,7 @@ instruct string_compareU(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI c instruct string_compareLU(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2, rax_RegI result, legRegD tmp1, rFlagsReg cr) %{ - predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU); + predicate(UseAVX <= 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU); match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); @@ -11228,7 +11551,23 @@ instruct string_compareLU(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI ins_encode %{ __ string_compare($str1$$Register, $str2$$Register, $cnt1$$Register, $cnt2$$Register, $result$$Register, - $tmp1$$XMMRegister, StrIntrinsicNode::LU); + $tmp1$$XMMRegister, StrIntrinsicNode::LU, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_compareLU_evex(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI cnt2, + rax_RegI result, legRegD tmp1, kReg ktmp, rFlagsReg cr) +%{ + predicate(UseAVX > 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP ktmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); + + format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %} + ins_encode %{ + __ string_compare($str1$$Register, $str2$$Register, + $cnt1$$Register, $cnt2$$Register, $result$$Register, + $tmp1$$XMMRegister, StrIntrinsicNode::LU, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -11236,7 +11575,7 
@@ instruct string_compareLU(rdi_RegP str1, rcx_RegI cnt1, rsi_RegP str2, rdx_RegI instruct string_compareUL(rsi_RegP str1, rdx_RegI cnt1, rdi_RegP str2, rcx_RegI cnt2, rax_RegI result, legRegD tmp1, rFlagsReg cr) %{ - predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL); + predicate(UseAVX <= 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL); match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); @@ -11244,7 +11583,23 @@ instruct string_compareUL(rsi_RegP str1, rdx_RegI cnt1, rdi_RegP str2, rcx_RegI ins_encode %{ __ string_compare($str2$$Register, $str1$$Register, $cnt2$$Register, $cnt1$$Register, $result$$Register, - $tmp1$$XMMRegister, StrIntrinsicNode::UL); + $tmp1$$XMMRegister, StrIntrinsicNode::UL, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_compareUL_evex(rsi_RegP str1, rdx_RegI cnt1, rdi_RegP str2, rcx_RegI cnt2, + rax_RegI result, legRegD tmp1, kReg ktmp, rFlagsReg cr) +%{ + predicate(UseAVX > 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP ktmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr); + + format %{ "String Compare byte[] $str1,$cnt1,$str2,$cnt2 -> $result // KILL $tmp1" %} + ins_encode %{ + __ string_compare($str2$$Register, $str1$$Register, + $cnt2$$Register, $cnt1$$Register, $result$$Register, + $tmp1$$XMMRegister, StrIntrinsicNode::UL, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -11405,6 +11760,7 @@ instruct string_indexofU_char(rdi_RegP str1, rdx_RegI cnt1, rax_RegI ch, instruct string_equals(rdi_RegP str1, rsi_RegP str2, rcx_RegI cnt, rax_RegI result, legRegD tmp1, legRegD tmp2, rbx_RegI tmp3, rFlagsReg cr) %{ + predicate(UseAVX <= 2); match(Set result (StrEquals (Binary str1 str2) cnt)); effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL tmp3, KILL cr); @@ -11412,7 +11768,23 @@ instruct string_equals(rdi_RegP str1, rsi_RegP str2, rcx_RegI cnt, rax_RegI resu ins_encode %{ __ arrays_equals(false, $str1$$Register, $str2$$Register, $cnt$$Register, $result$$Register, $tmp3$$Register, - $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */); + $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_equals_evex(rdi_RegP str1, rsi_RegP str2, rcx_RegI cnt, rax_RegI result, + legRegD tmp1, legRegD tmp2, kReg ktmp, rbx_RegI tmp3, rFlagsReg cr) +%{ + predicate(UseAVX > 2); + match(Set result (StrEquals (Binary str1 str2) cnt)); + effect(TEMP tmp1, TEMP tmp2, TEMP ktmp, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL tmp3, KILL cr); + + format %{ "String Equals $str1,$str2,$cnt -> $result // KILL $tmp1, $tmp2, $tmp3" %} + ins_encode %{ + __ arrays_equals(false, $str1$$Register, $str2$$Register, + $cnt$$Register, $result$$Register, $tmp3$$Register, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -11421,7 +11793,7 @@ instruct string_equals(rdi_RegP str1, rsi_RegP str2, rcx_RegI cnt, rax_RegI resu instruct array_equalsB(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result, legRegD tmp1, legRegD tmp2, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr) %{ - predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); + predicate(UseAVX <= 2 && ((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (AryEq ary1 ary2)); effect(TEMP tmp1, TEMP tmp2, 
USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr); @@ -11429,7 +11801,23 @@ instruct array_equalsB(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result, ins_encode %{ __ arrays_equals(true, $ary1$$Register, $ary2$$Register, $tmp3$$Register, $result$$Register, $tmp4$$Register, - $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */); + $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct array_equalsB_evex(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result, + legRegD tmp1, legRegD tmp2, kReg ktmp, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr) +%{ + predicate(UseAVX > 2 && ((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); + match(Set result (AryEq ary1 ary2)); + effect(TEMP tmp1, TEMP tmp2, TEMP ktmp, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr); + + format %{ "Array Equals byte[] $ary1,$ary2 -> $result // KILL $tmp1, $tmp2, $tmp3, $tmp4" %} + ins_encode %{ + __ arrays_equals(true, $ary1$$Register, $ary2$$Register, + $tmp3$$Register, $result$$Register, $tmp4$$Register, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, false /* char */, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -11437,7 +11825,7 @@ instruct array_equalsB(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result, instruct array_equalsC(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result, legRegD tmp1, legRegD tmp2, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr) %{ - predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); + predicate(UseAVX <= 2 && ((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (AryEq ary1 ary2)); effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr); @@ -11445,14 +11833,31 @@ instruct array_equalsC(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result, ins_encode %{ __ arrays_equals(true, $ary1$$Register, $ary2$$Register, $tmp3$$Register, $result$$Register, $tmp4$$Register, - $tmp1$$XMMRegister, $tmp2$$XMMRegister, true /* char */); + $tmp1$$XMMRegister, $tmp2$$XMMRegister, true /* char */, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct array_equalsC_evex(rdi_RegP ary1, rsi_RegP ary2, rax_RegI result, + legRegD tmp1, legRegD tmp2, kReg ktmp, rcx_RegI tmp3, rbx_RegI tmp4, rFlagsReg cr) +%{ + predicate(UseAVX > 2 && ((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); + match(Set result (AryEq ary1 ary2)); + effect(TEMP tmp1, TEMP tmp2, TEMP ktmp, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr); + + format %{ "Array Equals char[] $ary1,$ary2 -> $result // KILL $tmp1, $tmp2, $tmp3, $tmp4" %} + ins_encode %{ + __ arrays_equals(true, $ary1$$Register, $ary2$$Register, + $tmp3$$Register, $result$$Register, $tmp4$$Register, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, true /* char */, $ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} instruct has_negatives(rsi_RegP ary1, rcx_RegI len, rax_RegI result, - legRegD tmp1, legRegD tmp2, rbx_RegI tmp3, rFlagsReg cr) + legRegD tmp1, legRegD tmp2, rbx_RegI tmp3, rFlagsReg cr) %{ + predicate(UseAVX <= 2); match(Set result (HasNegatives ary1 len)); effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL len, KILL tmp3, KILL cr); @@ -11460,36 +11865,86 @@ instruct has_negatives(rsi_RegP ary1, rcx_RegI len, rax_RegI result, ins_encode %{ __ has_negatives($ary1$$Register, $len$$Register, $result$$Register, $tmp3$$Register, - $tmp1$$XMMRegister, $tmp2$$XMMRegister); + $tmp1$$XMMRegister, $tmp2$$XMMRegister, knoreg, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct has_negatives_evex(rsi_RegP ary1, rcx_RegI len, rax_RegI result, + legRegD tmp1,
legRegD tmp2, kReg ktmp1, kReg ktmp2, rbx_RegI tmp3, rFlagsReg cr) +%{ + predicate(UseAVX > 2); + match(Set result (HasNegatives ary1 len)); + effect(TEMP tmp1, TEMP tmp2, TEMP ktmp1, TEMP ktmp2, USE_KILL ary1, USE_KILL len, KILL tmp3, KILL cr); + + format %{ "has negatives byte[] $ary1,$len -> $result // KILL $tmp1, $tmp2, $tmp3" %} + ins_encode %{ + __ has_negatives($ary1$$Register, $len$$Register, + $result$$Register, $tmp3$$Register, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister); %} ins_pipe( pipe_slow ); %} // fast char[] to byte[] compression -instruct string_compress(rsi_RegP src, rdi_RegP dst, rdx_RegI len, legRegD tmp1, legRegD tmp2, legRegD tmp3, legRegD tmp4, - rcx_RegI tmp5, rax_RegI result, rFlagsReg cr) %{ +instruct string_compress(rsi_RegP src, rdi_RegP dst, rdx_RegI len, legRegD tmp1, legRegD tmp2, legRegD tmp3, + legRegD tmp4, rcx_RegI tmp5, rax_RegI result, rFlagsReg cr) %{ + predicate(UseAVX <= 2); match(Set result (StrCompressedCopy src (Binary dst len))); - effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL tmp5, KILL cr); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, + USE_KILL len, KILL tmp5, KILL cr); format %{ "String Compress $src,$dst -> $result // KILL RAX, RCX, RDX" %} ins_encode %{ __ char_array_compress($src$$Register, $dst$$Register, $len$$Register, $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, - $tmp4$$XMMRegister, $tmp5$$Register, $result$$Register); + $tmp4$$XMMRegister, $tmp5$$Register, $result$$Register, + knoreg, knoreg); %} ins_pipe( pipe_slow ); %} +instruct string_compress_evex(rsi_RegP src, rdi_RegP dst, rdx_RegI len, legRegD tmp1, legRegD tmp2, legRegD tmp3, + legRegD tmp4, kReg ktmp1, kReg ktmp2, rcx_RegI tmp5, rax_RegI result, rFlagsReg cr) %{ + predicate(UseAVX > 2); + match(Set result (StrCompressedCopy src (Binary dst len))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP ktmp1, TEMP ktmp2, USE_KILL src, USE_KILL dst, + USE_KILL len, KILL tmp5, KILL cr); + + format %{ "String Compress $src,$dst -> $result // KILL RAX, RCX, RDX" %} + ins_encode %{ + __ char_array_compress($src$$Register, $dst$$Register, $len$$Register, + $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, + $tmp4$$XMMRegister, $tmp5$$Register, $result$$Register, + $ktmp1$$KRegister, $ktmp2$$KRegister); + %} + ins_pipe( pipe_slow ); +%} // fast byte[] to char[] inflation instruct string_inflate(Universe dummy, rsi_RegP src, rdi_RegP dst, rdx_RegI len, legRegD tmp1, rcx_RegI tmp2, rFlagsReg cr) %{ + predicate(UseAVX <= 2); match(Set dummy (StrInflatedCopy src (Binary dst len))); effect(TEMP tmp1, TEMP tmp2, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); format %{ "String Inflate $src,$dst // KILL $tmp1, $tmp2" %} ins_encode %{ __ byte_array_inflate($src$$Register, $dst$$Register, $len$$Register, - $tmp1$$XMMRegister, $tmp2$$Register); + $tmp1$$XMMRegister, $tmp2$$Register, knoreg); + %} + ins_pipe( pipe_slow ); +%} + +instruct string_inflate_evex(Universe dummy, rsi_RegP src, rdi_RegP dst, rdx_RegI len, + legRegD tmp1, kReg ktmp, rcx_RegI tmp2, rFlagsReg cr) %{ + predicate(UseAVX > 2); + match(Set dummy (StrInflatedCopy src (Binary dst len))); + effect(TEMP tmp1, TEMP tmp2, TEMP ktmp, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); + + format %{ "String Inflate $src,$dst // KILL $tmp1, $tmp2" %} + ins_encode %{ + __ byte_array_inflate($src$$Register, $dst$$Register, $len$$Register, + $tmp1$$XMMRegister, $tmp2$$Register,
$ktmp$$KRegister); %} ins_pipe( pipe_slow ); %} @@ -12271,11 +12726,13 @@ instruct jmpLoopEndUCF(cmpOpUCF cop, rFlagsRegUCF cmp, label labl) %{ // mask version // Jump Direct Conditional - Label defines a relative address from Jcc+1 -instruct jmpLoopEnd_and_restoreMask(cmpOp cop, rFlagsReg cr, label labl) +// Bounded mask operand used in following pattern is needed for +// post-loop multiversioning. +instruct jmpLoopEnd_and_restoreMask(cmpOp cop, kReg_K1 ktmp, rFlagsReg cr, label labl) %{ - predicate(n->has_vector_mask_set()); + predicate(PostLoopMultiversioning && n->has_vector_mask_set()); match(CountedLoopEnd cop cr); - effect(USE labl); + effect(USE labl, TEMP ktmp); ins_cost(400); format %{ "j$cop $labl\t# loop end\n\t" @@ -12284,16 +12741,18 @@ instruct jmpLoopEnd_and_restoreMask(cmpOp cop, rFlagsReg cr, label labl) ins_encode %{ Label* L = $labl$$label; __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump - __ restorevectmask(); + __ restorevectmask($ktmp$$KRegister); %} ins_pipe(pipe_jcc); %} // Jump Direct Conditional - Label defines a relative address from Jcc+1 -instruct jmpLoopEndU_and_restoreMask(cmpOpU cop, rFlagsRegU cmp, label labl) %{ - predicate(n->has_vector_mask_set()); +// Bounded mask operand used in following pattern is needed for +// post-loop multiversioning. +instruct jmpLoopEndU_and_restoreMask(cmpOpU cop, kReg_K1 ktmp, rFlagsRegU cmp, label labl) %{ + predicate(PostLoopMultiversioning && n->has_vector_mask_set()); match(CountedLoopEnd cop cmp); - effect(USE labl); + effect(USE labl, TEMP ktmp); ins_cost(400); format %{ "j$cop,u $labl\t# loop end\n\t" @@ -12302,15 +12761,17 @@ instruct jmpLoopEndU_and_restoreMask(cmpOpU cop, rFlagsRegU cmp, label labl) %{ ins_encode %{ Label* L = $labl$$label; __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump - __ restorevectmask(); + __ restorevectmask($ktmp$$KRegister); %} ins_pipe(pipe_jcc); %} -instruct jmpLoopEndUCF_and_restoreMask(cmpOpUCF cop, rFlagsRegUCF cmp, label labl) %{ - predicate(n->has_vector_mask_set()); +// Bounded mask operand used in following pattern is needed for +// post-loop multiversioning. +instruct jmpLoopEndUCF_and_restoreMask(cmpOpUCF cop, kReg_K1 ktmp, rFlagsRegUCF cmp, label labl) %{ + predicate(PostLoopMultiversioning && n->has_vector_mask_set()); match(CountedLoopEnd cop cmp); - effect(USE labl); + effect(USE labl, TEMP ktmp); ins_cost(300); format %{ "j$cop,u $labl\t# loop end\n\t" @@ -12319,7 +12780,7 @@ instruct jmpLoopEndUCF_and_restoreMask(cmpOpUCF cop, rFlagsRegUCF cmp, label lab ins_encode %{ Label* L = $labl$$label; __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump - __ restorevectmask(); + __ restorevectmask($ktmp$$KRegister); %} ins_pipe(pipe_jcc); %} diff --git a/src/hotspot/share/adlc/archDesc.cpp b/src/hotspot/share/adlc/archDesc.cpp index 4a0a1461571..7e5ea8b850b 100644 --- a/src/hotspot/share/adlc/archDesc.cpp +++ b/src/hotspot/share/adlc/archDesc.cpp @@ -1,5 +1,5 @@ // -// Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // // This code is free software; you can redistribute it and/or modify it @@ -944,6 +944,10 @@ const char *ArchDesc::getIdealType(const char *idealOp) { } } + if (strncmp(idealOp, "RegVectMask", 8) == 0) { + return "TypeVect::VECTMASK"; + } + // !!!!!
switch(last_char) { case 'I': return "TypeInt::INT"; diff --git a/src/hotspot/share/adlc/forms.cpp b/src/hotspot/share/adlc/forms.cpp index 7037c54cc7e..f8038dc4c32 100644 --- a/src/hotspot/share/adlc/forms.cpp +++ b/src/hotspot/share/adlc/forms.cpp @@ -267,6 +267,7 @@ Form::DataType Form::is_load_from_memory(const char *opType) const { if( strcmp(opType,"LoadS")==0 ) return Form::idealS; if( strcmp(opType,"LoadVector")==0 ) return Form::idealV; if( strcmp(opType,"LoadVectorGather")==0 ) return Form::idealV; + if( strcmp(opType,"LoadVectorMasked")==0 ) return Form::idealV; assert( strcmp(opType,"Load") != 0, "Must type Loads" ); return Form::none; } @@ -284,6 +285,7 @@ Form::DataType Form::is_store_to_memory(const char *opType) const { if( strcmp(opType,"StoreNKlass")==0) return Form::idealNKlass; if( strcmp(opType,"StoreVector")==0 ) return Form::idealV; if( strcmp(opType,"StoreVectorScatter")==0 ) return Form::idealV; + if( strcmp(opType,"StoreVectorMasked")==0 ) return Form::idealV; assert( strcmp(opType,"Store") != 0, "Must type Stores" ); return Form::none; } diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index 3c0a9267e51..ec66d433a9a 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -779,6 +779,7 @@ bool InstructForm::captures_bottom_type(FormDict &globals) const { !strcmp(_matrule->_rChild->_opType,"ShenandoahCompareAndExchangeN") || #endif !strcmp(_matrule->_rChild->_opType,"StrInflatedCopy") || + !strcmp(_matrule->_rChild->_opType,"VectorMaskGen")|| !strcmp(_matrule->_rChild->_opType,"CompareAndExchangeP") || !strcmp(_matrule->_rChild->_opType,"CompareAndExchangeN"))) return true; else if ( is_ideal_load() == Form::idealP ) return true; @@ -3511,7 +3512,7 @@ int MatchNode::needs_ideal_memory_edge(FormDict &globals) const { "StoreB","StoreC","Store" ,"StoreFP", "LoadI", "LoadL", "LoadP" ,"LoadN", "LoadD" ,"LoadF" , "LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load" , - "StoreVector", "LoadVector", "LoadVectorGather", "StoreVectorScatter", + "StoreVector", "LoadVector", "LoadVectorGather", "StoreVectorScatter", "LoadVectorMasked", "StoreVectorMasked", "LoadRange", "LoadKlass", "LoadNKlass", "LoadL_unaligned", "LoadD_unaligned", "LoadPLocked", "StorePConditional", "StoreIConditional", "StoreLConditional", @@ -3964,6 +3965,7 @@ bool MatchRule::is_base_register(FormDict &globals) const { strcmp(opType,"RegL")==0 || strcmp(opType,"RegF")==0 || strcmp(opType,"RegD")==0 || + strcmp(opType,"RegVectMask")==0 || strcmp(opType,"VecS")==0 || strcmp(opType,"VecD")==0 || strcmp(opType,"VecX")==0 || @@ -4195,7 +4197,7 @@ bool MatchRule::is_vector() const { "VectorRearrange","VectorLoadShuffle", "VectorLoadConst", "VectorCastB2X", "VectorCastS2X", "VectorCastI2X", "VectorCastL2X", "VectorCastF2X", "VectorCastD2X", - "VectorMaskWrapper", "VectorMaskCmp", "VectorReinterpret", + "VectorMaskWrapper", "VectorMaskCmp", "VectorReinterpret","LoadVectorMasked","StoreVectorMasked", "FmaVD", "FmaVF","PopCountVI", // Next are not supported currently. "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D", diff --git a/src/hotspot/share/adlc/output_c.cpp b/src/hotspot/share/adlc/output_c.cpp index 41b2102a791..adeae0894ae 100644 --- a/src/hotspot/share/adlc/output_c.cpp +++ b/src/hotspot/share/adlc/output_c.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2021, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -2272,6 +2272,7 @@ class DefineEmitState { // and return the conversion function to build them from OptoReg const char* reg_conversion(const char* rep_var) { if (strcmp(rep_var,"$Register") == 0) return "as_Register"; + if (strcmp(rep_var,"$KRegister") == 0) return "as_KRegister"; if (strcmp(rep_var,"$FloatRegister") == 0) return "as_FloatRegister"; #if defined(IA32) || defined(AMD64) if (strcmp(rep_var,"$XMMRegister") == 0) return "as_XMMRegister"; diff --git a/src/hotspot/share/opto/chaitin.cpp b/src/hotspot/share/opto/chaitin.cpp index 50005421849..ea612285ee3 100644 --- a/src/hotspot/share/opto/chaitin.cpp +++ b/src/hotspot/share/opto/chaitin.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -803,7 +803,8 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) { // processes as vector in RA. if (RegMask::is_vector(ireg)) lrg._is_vector = 1; - assert(n_type->isa_vect() == NULL || lrg._is_vector || ireg == Op_RegD || ireg == Op_RegL, + assert(n_type->isa_vect() == NULL || lrg._is_vector || + ireg == Op_RegD || ireg == Op_RegL || ireg == Op_RegVectMask, "vector must be in vector registers"); // Check for bound register masks @@ -900,6 +901,10 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) { lrg._is_bound = 1; } break; + case Op_RegVectMask: + lrg.set_num_regs(RegMask::SlotsPerRegVectMask); + lrg.set_reg_pressure(1); + break; case Op_RegF: case Op_RegI: case Op_RegN: @@ -1016,8 +1021,8 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) { const RegMask &lrgmask = lrg.mask(); uint kreg = n->in(k)->ideal_reg(); bool is_vect = RegMask::is_vector(kreg); - assert(n->in(k)->bottom_type()->isa_vect() == NULL || - is_vect || kreg == Op_RegD || kreg == Op_RegL, + assert(n->in(k)->bottom_type()->isa_vect() == NULL || is_vect || + kreg == Op_RegD || kreg == Op_RegL || kreg == Op_RegVectMask, "vector must be in vector registers"); if (lrgmask.is_bound(kreg)) lrg._is_bound = 1; diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index c95115f25ac..37aecbf0899 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -394,6 +394,9 @@ macro(LoadVector) macro(LoadVectorGather) macro(StoreVector) macro(StoreVectorScatter) +macro(LoadVectorMasked) +macro(StoreVectorMasked) +macro(VectorMaskGen) macro(Pack) macro(PackB) macro(PackS) diff --git a/src/hotspot/share/opto/compile.cpp b/src/hotspot/share/opto/compile.cpp index 928596e0f07..542ee7ac65e 100644 --- a/src/hotspot/share/opto/compile.cpp +++ b/src/hotspot/share/opto/compile.cpp @@ -2546,7 +2546,7 @@ static bool is_vector_bitwise_op(Node* n) { } static bool is_vector_bitwise_cone_root(Node* n) { - if (!is_vector_bitwise_op(n)) { + if (n->bottom_type()->isa_vectmask() || !is_vector_bitwise_op(n)) { return false; } for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { @@ -3743,6 +3743,9 @@ void Compile::final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc) { case Op_StoreVector: case Op_LoadVectorGather: case Op_StoreVectorScatter: + case Op_VectorMaskGen: + case Op_LoadVectorMasked: + case Op_StoreVectorMasked: break; 
case Op_AddReductionVI: diff --git a/src/hotspot/share/opto/ifg.cpp b/src/hotspot/share/opto/ifg.cpp index 0e92b8e777f..b19e9f57e67 100644 --- a/src/hotspot/share/opto/ifg.cpp +++ b/src/hotspot/share/opto/ifg.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1998, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -402,7 +402,9 @@ uint PhaseChaitin::count_int_pressure(IndexSet* liveout) { LRG& lrg = lrgs(lidx); if (lrg.mask_is_nonempty_and_up() && !lrg.is_float_or_vector() && - lrg.mask().overlap(*Matcher::idealreg2regmask[Op_RegI])) { + (lrg.mask().overlap(*Matcher::idealreg2regmask[Op_RegI]) || + (Matcher::has_predicated_vectors() && + lrg.mask().overlap(*Matcher::idealreg2regmask[Op_RegVectMask])))) { cnt += lrg.reg_pressure(); } lidx = elements.next(); @@ -435,7 +437,9 @@ void PhaseChaitin::lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* l } else { // Do not count the SP and flag registers const RegMask& r = lrg.mask(); - if (r.overlap(*Matcher::idealreg2regmask[Op_RegI])) { + if (r.overlap(*Matcher::idealreg2regmask[Op_RegI]) || + (Matcher::has_predicated_vectors() && + r.overlap(*Matcher::idealreg2regmask[Op_RegVectMask]))) { int_pressure.lower(lrg, location); } } @@ -490,7 +494,9 @@ void PhaseChaitin::raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pr } else { // Do not count the SP and flag registers const RegMask& rm = lrg.mask(); - if (rm.overlap(*Matcher::idealreg2regmask[Op_RegI])) { + if (rm.overlap(*Matcher::idealreg2regmask[Op_RegI]) || + (Matcher::has_predicated_vectors() && + rm.overlap(*Matcher::idealreg2regmask[Op_RegVectMask]))) { int_pressure.raise(lrg); } } diff --git a/src/hotspot/share/opto/lcm.cpp b/src/hotspot/share/opto/lcm.cpp index 6a6105faf53..86845270e60 100644 --- a/src/hotspot/share/opto/lcm.cpp +++ b/src/hotspot/share/opto/lcm.cpp @@ -718,6 +718,7 @@ void PhaseCFG::adjust_register_pressure(Node* n, Block* block, intptr_t* recalc_ case Op_StoreN: case Op_StoreVector: case Op_StoreVectorScatter: + case Op_StoreVectorMasked: case Op_StoreNKlass: for (uint k = 1; k < m->req(); k++) { Node *in = m->in(k); diff --git a/src/hotspot/share/opto/machnode.hpp b/src/hotspot/share/opto/machnode.hpp index 8fddefcc853..f023a6d230e 100644 --- a/src/hotspot/share/opto/machnode.hpp +++ b/src/hotspot/share/opto/machnode.hpp @@ -100,6 +100,12 @@ class MachOper : public ResourceObj { } #if defined(IA32) || defined(AMD64) + KRegister as_KRegister(PhaseRegAlloc *ra_, const Node *node) const { + return ::as_KRegister(reg(ra_, node)); + } + KRegister as_KRegister(PhaseRegAlloc *ra_, const Node *node, int idx) const { + return ::as_KRegister(reg(ra_, node, idx)); + } XMMRegister as_XMMRegister(PhaseRegAlloc *ra_, const Node *node) const { return ::as_XMMRegister(reg(ra_, node)); } diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index cb9f16b82c9..60e7fa16ca7 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -87,6 +87,7 @@ Matcher::Matcher() idealreg2spillmask [Op_VecY] = NULL; idealreg2spillmask [Op_VecZ] = NULL; idealreg2spillmask [Op_RegFlags] = NULL; + idealreg2spillmask [Op_RegVectMask] = NULL; idealreg2debugmask [Op_RegI] = NULL; idealreg2debugmask [Op_RegN] = NULL; @@ -100,6 +101,7 @@ Matcher::Matcher() idealreg2debugmask [Op_VecY] = NULL; idealreg2debugmask [Op_VecZ] = NULL; idealreg2debugmask [Op_RegFlags] = NULL; + idealreg2debugmask [Op_RegVectMask] = NULL; idealreg2mhdebugmask[Op_RegI] = NULL; idealreg2mhdebugmask[Op_RegN] = NULL; @@ -113,6 +115,7 @@ Matcher::Matcher() idealreg2mhdebugmask[Op_VecY] = NULL; idealreg2mhdebugmask[Op_VecZ] = NULL; idealreg2mhdebugmask[Op_RegFlags] = NULL; + idealreg2mhdebugmask[Op_RegVectMask] = NULL; debug_only(_mem_node = NULL;) // Ideal memory node consumed by mach node } @@ -427,7 +430,7 @@ static RegMask *init_input_masks( uint size, RegMask &ret_adr, RegMask &fp ) { void Matcher::init_first_stack_mask() { // Allocate storage for spill masks as masks for the appropriate load type. - RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*11)); + RegMask *rms = (RegMask*)C->comp_arena()->Amalloc_D(sizeof(RegMask) * (3*12)); idealreg2spillmask [Op_RegN] = &rms[0]; idealreg2spillmask [Op_RegI] = &rms[1]; @@ -468,6 +471,10 @@ void Matcher::init_first_stack_mask() { idealreg2mhdebugmask[Op_VecY] = &rms[31]; idealreg2mhdebugmask[Op_VecZ] = &rms[32]; + idealreg2spillmask [Op_RegVectMask] = &rms[33]; + idealreg2debugmask [Op_RegVectMask] = &rms[34]; + idealreg2mhdebugmask[Op_RegVectMask] = &rms[35]; + OptoReg::Name i; // At first, start with the empty mask @@ -511,6 +518,11 @@ void Matcher::init_first_stack_mask() { *idealreg2spillmask[Op_RegD] = *idealreg2regmask[Op_RegD]; idealreg2spillmask[Op_RegD]->OR(aligned_stack_mask); + if (Matcher::has_predicated_vectors()) { + *idealreg2spillmask[Op_RegVectMask] = *idealreg2regmask[Op_RegVectMask]; + idealreg2spillmask[Op_RegVectMask]->OR(aligned_stack_mask); + } + if (Matcher::vector_size_supported(T_BYTE,4)) { *idealreg2spillmask[Op_VecS] = *idealreg2regmask[Op_VecS]; idealreg2spillmask[Op_VecS]->OR(C->FIRST_STACK_mask()); @@ -609,6 +621,7 @@ void Matcher::init_first_stack_mask() { *idealreg2debugmask [Op_RegF] = *idealreg2spillmask[Op_RegF]; *idealreg2debugmask [Op_RegD] = *idealreg2spillmask[Op_RegD]; *idealreg2debugmask [Op_RegP] = *idealreg2spillmask[Op_RegP]; + *idealreg2debugmask [Op_RegVectMask] = *idealreg2spillmask[Op_RegVectMask]; *idealreg2debugmask [Op_VecS] = *idealreg2spillmask[Op_VecS]; *idealreg2debugmask [Op_VecD] = *idealreg2spillmask[Op_VecD]; @@ -622,6 +635,7 @@ void Matcher::init_first_stack_mask() { *idealreg2mhdebugmask[Op_RegF] = *idealreg2spillmask[Op_RegF]; *idealreg2mhdebugmask[Op_RegD] = *idealreg2spillmask[Op_RegD]; *idealreg2mhdebugmask[Op_RegP] = *idealreg2spillmask[Op_RegP]; + *idealreg2mhdebugmask[Op_RegVectMask] = *idealreg2spillmask[Op_RegVectMask]; *idealreg2mhdebugmask[Op_VecS] = *idealreg2spillmask[Op_VecS]; *idealreg2mhdebugmask[Op_VecD] = *idealreg2spillmask[Op_VecD]; @@ -644,6 +658,7 @@ void Matcher::init_first_stack_mask() { idealreg2debugmask [Op_RegF]->Remove(i); // masks idealreg2debugmask [Op_RegD]->Remove(i); idealreg2debugmask [Op_RegP]->Remove(i); + idealreg2debugmask [Op_RegVectMask]->Remove(i); idealreg2debugmask [Op_VecS]->Remove(i); idealreg2debugmask [Op_VecD]->Remove(i); @@ -657,6 +672,7 @@ void Matcher::init_first_stack_mask() { 
idealreg2mhdebugmask[Op_RegF]->Remove(i); idealreg2mhdebugmask[Op_RegD]->Remove(i); idealreg2mhdebugmask[Op_RegP]->Remove(i); + idealreg2mhdebugmask[Op_RegVectMask]->Remove(i); idealreg2mhdebugmask[Op_VecS]->Remove(i); idealreg2mhdebugmask[Op_VecD]->Remove(i); @@ -923,6 +939,7 @@ void Matcher::init_spill_mask( Node *ret ) { idealreg2regmask[Op_VecX] = regmask_for_ideal_register(Op_VecX, ret); idealreg2regmask[Op_VecY] = regmask_for_ideal_register(Op_VecY, ret); idealreg2regmask[Op_VecZ] = regmask_for_ideal_register(Op_VecZ, ret); + idealreg2regmask[Op_RegVectMask] = regmask_for_ideal_register(Op_RegVectMask, ret); } #ifdef ASSERT @@ -2209,6 +2226,7 @@ void Matcher::find_shared( Node *n ) { case Op_FmaVD: case Op_FmaVF: case Op_MacroLogicV: + case Op_LoadVectorMasked: case Op_ThreadRefetch: // This must be added, otherwise we couldn't match the ThreadRefetchNode. set_shared(n); // Force result into register (it will be anyways) break; @@ -2379,6 +2397,12 @@ void Matcher::find_shared( Node *n ) { n->del_req(3); break; } + case Op_StoreVectorMasked: { + Node* pair = new BinaryNode(n->in(3), n->in(4)); + n->set_req(3, pair); + n->del_req(4); + break; + } case Op_LoopLimit: { Node *pair1 = new BinaryNode(n->in(1),n->in(2)); n->set_req(1,pair1); @@ -2597,6 +2621,7 @@ const RegMask* Matcher::regmask_for_ideal_register(uint ideal_reg, Node* ret) { case Op_VecX: // fall-through case Op_VecY: // fall-through case Op_VecZ: spill = new LoadVectorNode(NULL, mem, fp, atp, t->is_vect()); break; + case Op_RegVectMask: return Matcher::predicate_reg_mask(); default: ShouldNotReachHere(); spill = NULL; } diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index 839c1f5275c..b0d36c7b5d2 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -316,6 +316,8 @@ class Matcher : public PhaseTransform { // Some microarchitectures have mask registers used on vectors static const bool has_predicated_vectors(void); + static const RegMask* predicate_reg_mask(void); + static const TypeVect* predicate_reg_type(const Type* elemTy, int length); // Some uarchs have different sized float register resources static const int float_pressure(int default_pressure_threshold); diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index 63b707bca3c..1d72cd479fc 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -157,6 +157,8 @@ class TypeNode; class UnlockNode; class VectorNode; class LoadVectorNode; +class LoadVectorMaskedNode; +class StoreVectorMaskedNode; class LoadVectorGatherNode; class StoreVectorNode; class StoreVectorScatterNode; @@ -686,6 +688,8 @@ class Node { DEFINE_CLASS_ID(EncodeNarrowPtr, Type, 6) DEFINE_CLASS_ID(EncodeP, EncodeNarrowPtr, 0) DEFINE_CLASS_ID(EncodePKlass, EncodeNarrowPtr, 1) + DEFINE_CLASS_ID(Vector, Type, 7) + DEFINE_CLASS_ID(VectorMaskCmp, Vector, 0) #if INCLUDE_SHENANDOAHGC DEFINE_CLASS_ID(ShenandoahBarrier, Type, 7) #endif @@ -699,13 +703,15 @@ class Node { DEFINE_CLASS_ID(Parm, Proj, 4) DEFINE_CLASS_ID(MachProj, Proj, 5) - DEFINE_CLASS_ID(Mem, Node, 4) - DEFINE_CLASS_ID(Load, Mem, 0) + DEFINE_CLASS_ID(Mem, Node, 4) + DEFINE_CLASS_ID(Load, Mem, 0) DEFINE_CLASS_ID(LoadVector, Load, 0) DEFINE_CLASS_ID(LoadVectorGather, LoadVector, 0) + DEFINE_CLASS_ID(LoadVectorMasked, LoadVector, 1) DEFINE_CLASS_ID(Store, Mem, 1) DEFINE_CLASS_ID(StoreVector, Store, 0) DEFINE_CLASS_ID(StoreVectorScatter, StoreVector, 0) + DEFINE_CLASS_ID(StoreVectorMasked, StoreVector, 1) DEFINE_CLASS_ID(LoadStore, Mem, 2) DEFINE_CLASS_ID(LoadStoreConditional, LoadStore, 0) DEFINE_CLASS_ID(CompareAndSwap, LoadStoreConditional, 0) @@ -728,8 +734,6 @@ class Node { DEFINE_CLASS_ID(BoxLock, Node, 10) DEFINE_CLASS_ID(Add, Node, 11) DEFINE_CLASS_ID(Mul, Node, 12) - DEFINE_CLASS_ID(Vector, Node, 13) - DEFINE_CLASS_ID(VectorMaskCmp, Vector, 0) DEFINE_CLASS_ID(ClearArray, Node, 14) DEFINE_CLASS_ID(Halt, Node, 15) DEFINE_CLASS_ID(Opaque1, Node, 16) diff --git a/src/hotspot/share/opto/opcodes.cpp b/src/hotspot/share/opto/opcodes.cpp index e31e8d8477c..8c3860b4819 100644 --- a/src/hotspot/share/opto/opcodes.cpp +++ b/src/hotspot/share/opto/opcodes.cpp @@ -44,6 +44,7 @@ const char *NodeClassNames[] = { "VecX", "VecY", "VecZ", + "RegVectMask", "_last_machine_leaf", #include "classes.hpp" "_last_class_name", diff --git a/src/hotspot/share/opto/opcodes.hpp b/src/hotspot/share/opto/opcodes.hpp index ae3d61ce030..ec093d1bd3e 100644 --- a/src/hotspot/share/opto/opcodes.hpp +++ b/src/hotspot/share/opto/opcodes.hpp @@ -42,6 +42,7 @@ enum Opcodes { macro(VecX) // Machine vectorx register macro(VecY) // Machine vectory register macro(VecZ) // Machine vectorz register + macro(RegVectMask) // Vector mask/predicate register macro(RegFlags) // Machine flags register _last_machine_leaf, // Split between regular opcodes and machine #include "classes.hpp" diff --git a/src/hotspot/share/opto/optoreg.hpp b/src/hotspot/share/opto/optoreg.hpp index f01be1913e8..10dfd3d2d35 100644 --- a/src/hotspot/share/opto/optoreg.hpp +++ b/src/hotspot/share/opto/optoreg.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2021, Oracle and/or its affiliates. All rights reserved. 
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -36,7 +36,7 @@ // non-SSA names. A Register is represented as a number. Non-regular values // (e.g., Control, Memory, I/O) use the Special register. The actual machine // registers (as described in the ADL file for a machine) start at zero. -// Stack-slots (spill locations) start at the nest Chunk past the last machine +// Stack-slots (spill locations) start at the next Chunk past the last machine // register. // // Note that stack spill-slots are treated as a very large register set. diff --git a/src/hotspot/share/opto/regmask.cpp b/src/hotspot/share/opto/regmask.cpp index 2e04c42eb7e..d2c578367cc 100644 --- a/src/hotspot/share/opto/regmask.cpp +++ b/src/hotspot/share/opto/regmask.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -128,6 +128,8 @@ int RegMask::num_registers(uint ireg) { return 8; case Op_VecX: return 4; + case Op_RegVectMask: + return SlotsPerRegVectMask; case Op_VecD: case Op_RegD: case Op_RegL: diff --git a/src/hotspot/share/opto/regmask.hpp b/src/hotspot/share/opto/regmask.hpp index c64d0879592..3dbef345e18 100644 --- a/src/hotspot/share/opto/regmask.hpp +++ b/src/hotspot/share/opto/regmask.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -99,7 +99,9 @@ class RegMask { SlotsPerVecD = 2, SlotsPerVecX = 4, SlotsPerVecY = 8, - SlotsPerVecZ = 16 }; + SlotsPerVecZ = 16, + SlotsPerRegVectMask = X86_ONLY(2) NOT_X86(1) + }; // A constructor only used by the ADLC output. All mask fields are filled // in directly. Calls to this look something like RM(1,2,3,4); diff --git a/src/hotspot/share/opto/type.cpp b/src/hotspot/share/opto/type.cpp index f9bfe80de41..21045c3a8eb 100644 --- a/src/hotspot/share/opto/type.cpp +++ b/src/hotspot/share/opto/type.cpp @@ -67,18 +67,21 @@ const Type::TypeInfo Type::_type_info[Type::lastype] = { { Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY { Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ #elif defined(PPC64) + { Bad, T_ILLEGAL, "vectormask:", false, Op_RegVectMask, relocInfo::none }, // VectorMask. { Bad, T_ILLEGAL, "vectors:", false, 0, relocInfo::none }, // VectorS { Bad, T_ILLEGAL, "vectord:", false, Op_RegL, relocInfo::none }, // VectorD { Bad, T_ILLEGAL, "vectorx:", false, Op_VecX, relocInfo::none }, // VectorX { Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY { Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ #elif defined(S390) + { Bad, T_ILLEGAL, "vectormask:", false, Op_RegVectMask, relocInfo::none }, // VectorMask. 
{ Bad, T_ILLEGAL, "vectors:", false, 0, relocInfo::none }, // VectorS { Bad, T_ILLEGAL, "vectord:", false, Op_RegL, relocInfo::none }, // VectorD { Bad, T_ILLEGAL, "vectorx:", false, 0, relocInfo::none }, // VectorX { Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY { Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ #else // all other + { Bad, T_ILLEGAL, "vectormask:", false, Op_RegVectMask, relocInfo::none }, // VectorMask. { Bad, T_ILLEGAL, "vectors:", false, Op_VecS, relocInfo::none }, // VectorS { Bad, T_ILLEGAL, "vectord:", false, Op_VecD, relocInfo::none }, // VectorD { Bad, T_ILLEGAL, "vectorx:", false, Op_VecX, relocInfo::none }, // VectorX @@ -663,6 +666,9 @@ void Type::Initialize_shared(Compile* current) { // get_zero_type() should not happen for T_CONFLICT _zero_type[T_CONFLICT]= NULL; + TypeVect::VECTMASK = (TypeVect*)(new TypeVectMask(TypeInt::BOOL, MaxVectorSize))->hashcons(); + mreg2type[Op_RegVectMask] = TypeVect::VECTMASK; + // Vector predefined types, it needs initialized _const_basic_type[]. if (Matcher::vector_size_supported(T_BYTE,4)) { TypeVect::VECTS = TypeVect::make(T_BYTE,4); @@ -2350,6 +2356,7 @@ const TypeVect *TypeVect::VECTD = NULL; // 64-bit vectors const TypeVect *TypeVect::VECTX = NULL; // 128-bit vectors const TypeVect *TypeVect::VECTY = NULL; // 256-bit vectors const TypeVect *TypeVect::VECTZ = NULL; // 512-bit vectors +const TypeVect *TypeVect::VECTMASK = NULL; // predicate/mask vector //------------------------------make------------------------------------------- const TypeVect* TypeVect::make(const Type *elem, uint length) { @@ -2376,6 +2383,15 @@ const TypeVect* TypeVect::make(const Type *elem, uint length) { return NULL; } +const TypeVect *TypeVect::makemask(const Type* elem, uint length) { + if (Matcher::has_predicated_vectors()) { + const TypeVect* mtype = Matcher::predicate_reg_type(elem, length); + return (TypeVect*)(const_cast(mtype))->hashcons(); + } else { + return make(elem, length); + } +} + //------------------------------meet------------------------------------------- // Compute the MEET of two types. It returns a new Type object. const Type *TypeVect::xmeet( const Type *t ) const { @@ -2390,7 +2406,13 @@ const Type *TypeVect::xmeet( const Type *t ) const { default: // All else is a mistake typerr(t); - + case VectorMask: { + const TypeVectMask* v = t->is_vectmask(); + assert( base() == v->base(), ""); + assert(length() == v->length(), ""); + assert(element_basic_type() == v->element_basic_type(), ""); + return TypeVect::makemask(_elem->xmeet(v->_elem), _length); + } case VectorS: case VectorD: case VectorX: @@ -2455,6 +2477,8 @@ void TypeVect::dump2(Dict &d, uint depth, outputStream *st) const { st->print("vectory["); break; case VectorZ: st->print("vectorz["); break; + case VectorMask: + st->print("vectormask["); break; default: ShouldNotReachHere(); } @@ -2464,6 +2488,14 @@ void TypeVect::dump2(Dict &d, uint depth, outputStream *st) const { } #endif +bool TypeVectMask::eq(const Type *t) const { + const TypeVectMask *v = t->is_vectmask(); + return (element_type() == v->element_type()) && (length() == v->length()); +} + +const Type *TypeVectMask::xdual() const { + return new TypeVectMask(element_type()->dual(), length()); +} //============================================================================= // Convenience common pre-built types. 
diff --git a/src/hotspot/share/opto/type.hpp b/src/hotspot/share/opto/type.hpp index 2b2a703e7b6..b31a8d23956 100644 --- a/src/hotspot/share/opto/type.hpp +++ b/src/hotspot/share/opto/type.hpp @@ -58,6 +58,7 @@ class TypeVectD; class TypeVectX; class TypeVectY; class TypeVectZ; +class TypeVectMask; class TypePtr; class TypeRawPtr; class TypeOopPtr; @@ -87,6 +88,8 @@ class Type { Tuple, // Method signature or object layout Array, // Array types + + VectorMask, // Vector predicate/mask type VectorS, // 32bit Vector types VectorD, // 64bit Vector types VectorX, // 128bit Vector types @@ -295,6 +298,8 @@ class Type { const TypeAry *is_ary() const; // Array, NOT array pointer const TypeVect *is_vect() const; // Vector const TypeVect *isa_vect() const; // Returns NULL if not a Vector + const TypeVectMask *is_vectmask() const; // Predicate/Mask Vector + const TypeVectMask *isa_vectmask() const; // Returns NULL if not a Vector Predicate/Mask const TypePtr *is_ptr() const; // Asserts it is a ptr type const TypePtr *isa_ptr() const; // Returns NULL if not ptr type const TypeRawPtr *isa_rawptr() const; // NOT Java oop @@ -774,6 +779,13 @@ class TypeVect : public Type { // Used directly by Replicate nodes to construct singleton vector. static const TypeVect *make(const Type* elem, uint length); + static const TypeVect *makemask(const BasicType elem_bt, uint length) { + // Use bottom primitive type. + return makemask(get_const_basic_type(elem_bt), length); + } + static const TypeVect *makemask(const Type* elem, uint length); + + virtual const Type *xmeet( const Type *t) const; virtual const Type *xdual() const; // Compute dual right now. @@ -782,6 +794,7 @@ class TypeVect : public Type { static const TypeVect *VECTX; static const TypeVect *VECTY; static const TypeVect *VECTZ; + static const TypeVect *VECTMASK; #ifndef PRODUCT virtual void dump2(Dict &d, uint, outputStream *st) const; // Specialized per-Type dumping @@ -813,6 +826,14 @@ class TypeVectZ : public TypeVect { TypeVectZ(const Type* elem, uint length) : TypeVect(VectorZ, elem, length) {} }; +class TypeVectMask : public TypeVect { +public: + friend class TypeVect; + TypeVectMask(const Type* elem, uint length) : TypeVect(VectorMask, elem, length) {} + virtual bool eq(const Type *t) const; + virtual const Type *xdual() const; +}; + //------------------------------TypePtr---------------------------------------- // Class of machine Pointer Types: raw data, instances or arrays. // If the _base enum is AnyPtr, then this refers to all of the above. @@ -1637,13 +1658,22 @@ inline const TypeAry *Type::is_ary() const { return (TypeAry*)this; } +inline const TypeVectMask *Type::is_vectmask() const { + assert( _base == VectorMask, "Not a Vector Mask" ); + return (TypeVectMask*)this; +} + +inline const TypeVectMask *Type::isa_vectmask() const { + return (_base == VectorMask) ? (TypeVectMask*)this : NULL; +} + inline const TypeVect *Type::is_vect() const { - assert( _base >= VectorS && _base <= VectorZ, "Not a Vector" ); + assert( _base >= VectorMask && _base <= VectorZ, "Not a Vector" ); return (TypeVect*)this; } inline const TypeVect *Type::isa_vect() const { - return (_base >= VectorS && _base <= VectorZ) ? (TypeVect*)this : NULL; + return (_base >= VectorMask && _base <= VectorZ) ? 
(TypeVect*)this : NULL; } inline const TypePtr *Type::is_ptr() const { diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 723b35c78d4..c9fbe40401c 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -648,6 +648,41 @@ StoreVectorNode* StoreVectorNode::make(int opc, Node* ctl, Node* mem, return new StoreVectorNode(ctl, mem, adr, atyp, val); } +Node* LoadVectorMaskedNode::Ideal(PhaseGVN* phase, bool can_reshape) { + Node* mask_len = in(3)->in(1); + const TypeLong* ty = phase->type(mask_len)->isa_long(); + if (ty && ty->is_con()) { + BasicType mask_bt = ((VectorMaskGenNode*)in(3))->get_elem_type()->array_element_basic_type(); + uint load_sz = type2aelembytes(mask_bt) * ty->get_con(); + if ( load_sz == 32 || load_sz == 64) { + assert(load_sz == 32 || MaxVectorSize > 32, "Unexpected load size"); + Node* ctr = in(MemNode::Control); + Node* mem = in(MemNode::Memory); + Node* adr = in(MemNode::Address); + return phase->transform(new LoadVectorNode(ctr, mem, adr, adr_type(), vect_type())); + } + } + return NULL; +} + +Node* StoreVectorMaskedNode::Ideal(PhaseGVN* phase, bool can_reshape) { + Node* mask_len = in(4)->in(1); + const TypeLong* ty = phase->type(mask_len)->isa_long(); + if (ty && ty->is_con()) { + BasicType mask_bt = ((VectorMaskGenNode*)in(4))->get_elem_type()->array_element_basic_type(); + uint load_sz = type2aelembytes(mask_bt) * ty->get_con(); + if ( load_sz == 32 || load_sz == 64) { + assert(load_sz == 32 || MaxVectorSize > 32, "Unexpected store size"); + Node* ctr = in(MemNode::Control); + Node* mem = in(MemNode::Memory); + Node* adr = in(MemNode::Address); + Node* val = in(MemNode::ValueIn); + return phase->transform(new StoreVectorNode(ctr, mem, adr, adr_type(), val)); + } + } + return NULL; +} + int ExtractNode::opcode(BasicType bt) { switch (bt) { case T_BOOLEAN: return Op_ExtractUB; diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index eae8d2a9fa0..d2fd269616d 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -775,6 +775,59 @@ class StoreVectorNode : public StoreNode { idx == MemNode::ValueIn + 1; } }; +class StoreVectorMaskedNode : public StoreVectorNode { + public: + StoreVectorMaskedNode(Node* c, Node* mem, Node* dst, Node* src, const TypePtr* at, Node* mask) + : StoreVectorNode(c, mem, dst, at, src) { + assert(mask->bottom_type()->is_vectmask(), "sanity"); + init_class_id(Class_StoreVector); + set_mismatched_access(); + add_req(mask); + } + + virtual int Opcode() const; + + virtual uint match_edge(uint idx) const { + return idx > 1; + } + Node* Ideal(PhaseGVN* phase, bool can_reshape); +}; + +class LoadVectorMaskedNode : public LoadVectorNode { + public: + LoadVectorMaskedNode(Node* c, Node* mem, Node* src, const TypePtr* at, const TypeVect* vt, Node* mask) + : LoadVectorNode(c, mem, src, at, vt) { + assert(mask->bottom_type()->is_vectmask(), "sanity"); + init_class_id(Class_LoadVector); + set_mismatched_access(); + add_req(mask); + } + + virtual int Opcode() const; + + virtual uint match_edge(uint idx) const { + return idx > 1; + } + Node* Ideal(PhaseGVN* phase, bool can_reshape); +}; + +class VectorMaskGenNode : public TypeNode { + public: + VectorMaskGenNode(Node* length, const Type* ty, const Type* ety): TypeNode(ty, 2), _elemType(ety) { + init_req(1, length); + } + + virtual int Opcode() const; + const Type* get_elem_type() { return _elemType;} + virtual uint size_of() const { return 
sizeof(VectorMaskGenNode); } + virtual uint ideal_reg() const { + return Op_RegVectMask; + } + + private: + const Type* _elemType; +}; + //=========================Promote_Scalar_to_Vector============================ //------------------------------ReplicateBNode---------------------------------
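Taken together, VectorMaskGen and the two masked memory nodes defined above let the compiler express "the low len lanes are active" as a first-class value (carried in a kReg on x86) and consume it in a single predicated load or store, instead of peeling a scalar tail loop. The Ideal() transforms in vectornode.cpp then erase the mask whenever a constant length provably covers the full 32- or 64-byte vector. Below is a standalone scalar emulation of that behaviour, assuming one 64-byte vector for the demo; none of the names are HotSpot code.

#include <cstdint>
#include <cstring>
#include <cstdio>

static const unsigned kVecBytes = 64;  // assumed vector width for the demo

// VectorMaskGen analogue: the low `len` bits set ((1 << len) - 1 semantics).
static uint64_t mask_gen(unsigned len) {
  return len >= 64 ? ~0ULL : ((1ULL << len) - 1);
}

// StoreVectorMasked analogue: write only the active lanes.
static void masked_copy(uint8_t* dst, const uint8_t* src, uint64_t mask) {
  if (mask == ~0ULL) {
    // The Ideal() case: a full-width mask carries no information, so the
    // masked store degenerates to a plain vector store.
    std::memcpy(dst, src, kVecBytes);
    return;
  }
  for (unsigned i = 0; i < kVecBytes; i++) {
    if (mask & (1ULL << i)) dst[i] = src[i];
  }
}

int main() {
  uint8_t src[kVecBytes], dst[kVecBytes] = {0};
  for (unsigned i = 0; i < kVecBytes; i++) src[i] = (uint8_t)i;
  masked_copy(dst, src, mask_gen(13));                       // 13-byte tail, no peel loop
  std::printf("dst[12]=%d dst[13]=%d\n", dst[12], dst[13]);  // prints 12 and 0
  return 0;
}

The match_edge(idx > 1) overrides above exist for the same reason the matcher packs the mask input into a BinaryNode in find_shared(): match rules are binary trees, so the extra mask edge has to ride along as an operand rather than a memory edge.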