Skip to content

Commit

Permalink
Merge pull request #15405 from unknownbrackets/softjit-sse2
Browse files (browse the repository at this point in the history)
Fix some samplerjit issues without SSE4 or AVX
  • Loading branch information
hrydgard authored Feb 16, 2022
2 parents 7a7bb7d + ad18833 commit 9bc6b96
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 32 deletions.
20 changes: 10 additions & 10 deletions Common/x64Emitter.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -1698,7 +1698,7 @@ void XEmitter::PSRLW(X64Reg dest, X64Reg reg, int shift) {
}
MOVDQA(dest, R(reg));
}
WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
WriteSSEOp(0x66, 0x71, (X64Reg)2, R(dest));
Write8(shift);
}

Expand All @@ -1710,7 +1710,7 @@ void XEmitter::PSRLD(X64Reg dest, X64Reg reg, int shift) {
}
MOVDQA(dest, R(reg));
}
WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
WriteSSEOp(0x66, 0x72, (X64Reg)2, R(dest));
Write8(shift);
}

Expand All @@ -1722,7 +1722,7 @@ void XEmitter::PSRLQ(X64Reg dest, X64Reg reg, int shift) {
}
MOVDQA(dest, R(reg));
}
WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
WriteSSEOp(0x66, 0x73, (X64Reg)2, R(dest));
Write8(shift);
}

Expand All @@ -1734,7 +1734,7 @@ void XEmitter::PSRLDQ(X64Reg dest, X64Reg reg, int shift) {
}
MOVDQA(dest, R(reg));
}
WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
WriteSSEOp(0x66, 0x73, (X64Reg)3, R(dest));
Write8(shift);
}

Expand All @@ -1750,7 +1750,7 @@ void XEmitter::PSLLW(X64Reg dest, X64Reg reg, int shift) {
}
MOVDQA(dest, R(reg));
}
WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
WriteSSEOp(0x66, 0x71, (X64Reg)6, R(dest));
Write8(shift);
}

Expand All @@ -1762,7 +1762,7 @@ void XEmitter::PSLLD(X64Reg dest, X64Reg reg, int shift) {
}
MOVDQA(dest, R(reg));
}
WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
WriteSSEOp(0x66, 0x72, (X64Reg)6, R(dest));
Write8(shift);
}

Expand All @@ -1774,7 +1774,7 @@ void XEmitter::PSLLQ(X64Reg dest, X64Reg reg, int shift) {
}
MOVDQA(dest, R(reg));
}
WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
WriteSSEOp(0x66, 0x73, (X64Reg)6, R(dest));
Write8(shift);
}

Expand All @@ -1786,7 +1786,7 @@ void XEmitter::PSLLDQ(X64Reg dest, X64Reg reg, int shift) {
}
MOVDQA(dest, R(reg));
}
WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));
WriteSSEOp(0x66, 0x73, (X64Reg)7, R(dest));
Write8(shift);
}

Expand All @@ -1802,7 +1802,7 @@ void XEmitter::PSRAW(X64Reg dest, X64Reg reg, int shift) {
}
MOVDQA(dest, R(reg));
}
WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg));
WriteSSEOp(0x66, 0x71, (X64Reg)4, R(dest));
Write8(shift);
}

Expand All @@ -1814,7 +1814,7 @@ void XEmitter::PSRAD(X64Reg dest, X64Reg reg, int shift) {
}
MOVDQA(dest, R(reg));
}
WriteSSEOp(0x66, 0x72, (X64Reg)4, R(reg));
WriteSSEOp(0x66, 0x72, (X64Reg)4, R(dest));
Write8(shift);
}

Expand Down
39 changes: 17 additions & 22 deletions GPU/Software/SamplerX86.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -99,11 +99,11 @@ FetchFunc SamplerJitCache::CompileFetch(const SamplerID &id) {
if (cpu_info.bSSE4_1) {
PMOVZXBD(vecResultReg, R(vecResultReg));
} else {
X64Reg vecTempReg = regCache_.Find(RegCache::VEC_TEMP0);
X64Reg vecTempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
PXOR(vecTempReg, R(vecTempReg));
PUNPCKLBW(vecResultReg, R(vecTempReg));
PUNPCKLWD(vecResultReg, R(vecTempReg));
regCache_.Unlock(vecTempReg, RegCache::VEC_TEMP0);
regCache_.Release(vecTempReg, RegCache::VEC_TEMP0);
}
regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);

Expand Down Expand Up @@ -1068,17 +1068,17 @@ bool SamplerJitCache::Jit_GetDataQuad(const SamplerID &id, bool level1, int bits
if (i != 3)
PSRLDQ(byteOffsetReg, 4);
if (bitsPerTexel <= 8) {
MOVZX(32, 8, temp2Reg, MComplex(baseReg, temp2Reg, SCALE_2, 0));
MOVZX(32, 8, temp2Reg, MComplex(baseReg, temp2Reg, SCALE_1, 0));
PINSRW(destReg, R(temp2Reg), i * 2);
} else if (bitsPerTexel == 16) {
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_2, 0), i * 2);
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
} else if (bitsPerTexel == 32) {
if (i == 0) {
MOVD_xmm(destReg, MComplex(baseReg, temp2Reg, SCALE_2, 0));
MOVD_xmm(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0));
} else {
// Maybe a temporary would be better, but this path should be rare.
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_2, 0), i * 2);
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_2, 2), i * 2 + 1);
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 0), i * 2);
PINSRW(destReg, MComplex(baseReg, temp2Reg, SCALE_1, 2), i * 2 + 1);
}
}
}
Expand Down Expand Up @@ -1410,18 +1410,12 @@ bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
X64Reg bottomReg = regCache_.Alloc(RegCache::VEC_TEMP1);

X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
if (!cpu_info.bSSE4_1) {
X64Reg zeroReg = GetZeroVec();
PSHUFD(topReg, R(quadReg), _MM_SHUFFLE(0, 0, 1, 0));
PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
PUNPCKLBW(topReg, R(zeroReg));
PUNPCKLBW(bottomReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
} else {
PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
PMOVZXBW(topReg, R(quadReg));
PMOVZXBW(bottomReg, R(bottomReg));
}
X64Reg zeroReg = GetZeroVec();
PSHUFD(topReg, R(quadReg), _MM_SHUFFLE(0, 0, 1, 0));
PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
PUNPCKLBW(topReg, R(zeroReg));
PUNPCKLBW(bottomReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
if (!level1) {
regCache_.Unlock(quadReg, RegCache::VEC_RESULT);
regCache_.ForceRelease(RegCache::VEC_RESULT);
Expand All @@ -1440,7 +1434,7 @@ bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
// Now subtract 0x10 - frac_u in the L lanes only: 00000000 LLLLLLLL.
MOVDQA(fracMulReg, M(const10Low_));
PSUBW(fracMulReg, R(fracReg));
// Then we just shift and OR in the original frac_u.
// Then we just put the original frac_u in the upper bits.
PUNPCKLQDQ(fracMulReg, R(fracReg));
regCache_.Release(fracReg, RegCache::VEC_TEMP2);

Expand All @@ -1457,7 +1451,7 @@ bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
if (level1) {
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(3, 3, 3, 3));
} else {
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(2, 2, 2, 2));
PSHUFLW(fracReg, R(allFracReg), _MM_SHUFFLE(1, 1, 1, 1));
}
PSHUFD(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(allFracReg, RegCache::VEC_FRAC);
Expand Down Expand Up @@ -3024,7 +3018,8 @@ bool SamplerJitCache::Jit_PrepareDataDirectOffsets(const SamplerID &id, RegCache
X64Reg bufwVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
if (!id.useStandardBufw || id.hasAnyMips) {
// Spread bufw into each lane.
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR); if (cpu_info.bAVX2) {
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
if (cpu_info.bAVX2) {
VPBROADCASTD(128, bufwVecReg, MDisp(bufwReg, level1 ? 4 : 0));
} else {
MOVD_xmm(bufwVecReg, MDisp(bufwReg, level1 ? 4 : 0));
Expand Down

0 comments on commit 9bc6b96

Please sign in to comment.