CodeGen: Optimize vector ops for X64 when the source is computed (#1174)

With the TAG_VECTOR change, we can now confidently distinguish cases where the .w component contains the TVECTOR tag from cases where it doesn't: loads and tag ops produce the tag, whereas other instructions don't.

We now take advantage of this fact and only apply vandps with a mask when we need to.
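
For context on why the mask is needed at all: when a vector TValue is loaded as a full 16-byte value into an xmm register, the type tag lands in the fourth float lane, and a small integer reinterpreted as a float is a denormal, which some CPUs handle slowly. Below is a minimal standalone demo of that reinterpretation (not part of the change; the tag value 4 matches Luau's lua.h at the time of writing, but treat it as an assumption):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        const int32_t tagBits = 4; // LUA_TVECTOR's integer value, assumed here
        float w;
        std::memcpy(&w, &tagBits, sizeof(w)); // same bit pattern the .w lane sees after a 16-byte load

        // All exponent bits are zero and the mantissa is nonzero, so this is a
        // denormal (~5.6e-45); vandps with a mask that zeroes .w keeps it out of
        // subsequent vector math.
        std::printf("tag bits reinterpreted as float: %g\n", w);
        return 0;
    }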

It would be possible to use a positive filter instead (explicitly checking whether the source comes from ADD_VEC et al.), but that requires checking more instructions, and since this is purely an optimization it is allowed to be conservative: the cost of a mistake here is a potential slowdown, not a correctness issue.
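
To make the negative filter concrete, here is a minimal sketch of the check (the IrCmd values mirror the vecOp helper added in the diff below; the enum subset and the function name needsWMask are stand-ins for illustration, not part of the actual change):

    // Stand-in for the real IrCmd enum in the codegen IR (subset only).
    enum class IrCmd { LOAD_TVALUE, TAG_VECTOR, ADD_VEC, SUB_VEC, MUL_VEC, DIV_VEC };

    // An operand gets the vandps mask only if its defining instruction can leave
    // the TVECTOR tag in .w: loads and tag ops do, computed vector math does not.
    // Missing a case here only risks slower denormal math, never wrong results.
    bool needsWMask(IrCmd sourceCmd)
    {
        return sourceCmd == IrCmd::LOAD_TVALUE || sourceCmd == IrCmd::TAG_VECTOR;
    }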

Additionally, this change performs vandps only once when both arguments are the same, instead of doing it twice.
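
In the lowered code this shows up as the second operand reusing the first operand's (possibly masked) register whenever both IR operands refer to the same value, e.g. when lowering an expression like v + v (lines quoted from the ADD_VEC hunk below, with an added comment):

    RegisterX64 tmpa = vecOp(inst.a, tmp1);
    // When both operands are the same IR value, reuse tmpa so at most one vandps is emitted.
    RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);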

On the function that computes a polynomial approximation, this change makes it ~20% faster on Zen 4.
Arseny Kapoulkine, 2024-03-01 03:32:43 -08:00
commit cc51e616ce (parent c9324853e5)
2 changed files with 45 additions and 24 deletions

CodeGen/src/IrLoweringX64.cpp

@@ -16,6 +16,7 @@
 #include "lgc.h"
 LUAU_FASTFLAG(LuauCodegenVectorTag)
+LUAU_FASTFLAGVARIABLE(LuauCodegenVectorOptAnd, false)
 namespace Luau
 {
@@ -603,13 +604,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
 {
     inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
-    ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-    ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+    ScopedRegX64 tmp1{regs};
+    ScopedRegX64 tmp2{regs};
-    // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-    build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-    build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-    build.vaddps(inst.regX64, tmp1.reg, tmp2.reg);
+    RegisterX64 tmpa = vecOp(inst.a, tmp1);
+    RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+    build.vaddps(inst.regX64, tmpa, tmpb);
     if (!FFlag::LuauCodegenVectorTag)
         build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
@@ -619,13 +620,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
 {
     inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
-    ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-    ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+    ScopedRegX64 tmp1{regs};
+    ScopedRegX64 tmp2{regs};
-    // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-    build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-    build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-    build.vsubps(inst.regX64, tmp1.reg, tmp2.reg);
+    RegisterX64 tmpa = vecOp(inst.a, tmp1);
+    RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+    build.vsubps(inst.regX64, tmpa, tmpb);
     if (!FFlag::LuauCodegenVectorTag)
         build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
     break;
@@ -634,13 +635,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
 {
     inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
-    ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-    ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+    ScopedRegX64 tmp1{regs};
+    ScopedRegX64 tmp2{regs};
-    // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-    build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-    build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-    build.vmulps(inst.regX64, tmp1.reg, tmp2.reg);
+    RegisterX64 tmpa = vecOp(inst.a, tmp1);
+    RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+    build.vmulps(inst.regX64, tmpa, tmpb);
     if (!FFlag::LuauCodegenVectorTag)
         build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
     break;
@@ -649,13 +650,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
 {
     inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
-    ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-    ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+    ScopedRegX64 tmp1{regs};
+    ScopedRegX64 tmp2{regs};
-    // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-    build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-    build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-    build.vdivps(inst.regX64, tmp1.reg, tmp2.reg);
+    RegisterX64 tmpa = vecOp(inst.a, tmp1);
+    RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+    build.vdivps(inst.regX64, tmpa, tmpb);
     if (!FFlag::LuauCodegenVectorTag)
         build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
     break;
@@ -2234,6 +2235,24 @@ OperandX64 IrLoweringX64::bufferAddrOp(IrOp bufferOp, IrOp indexOp)
     return noreg;
 }
+RegisterX64 IrLoweringX64::vecOp(IrOp op, ScopedRegX64& tmp)
+{
+    if (FFlag::LuauCodegenVectorOptAnd && FFlag::LuauCodegenVectorTag)
+    {
+        IrInst source = function.instOp(op);
+        CODEGEN_ASSERT(source.cmd != IrCmd::SUBSTITUTE); // we don't process substitutions
+        // source that comes from memory or from tag instruction has .w = TVECTOR, which is denormal
+        // to avoid performance degradation on some CPUs we mask this component to produce zero
+        // otherwise we conservatively assume the vector is a result of a well formed math op so .w is a normal number or zero
+        if (source.cmd != IrCmd::LOAD_TVALUE && source.cmd != IrCmd::TAG_VECTOR)
+            return regOp(op);
+    }
+    tmp.alloc(SizeX64::xmmword);
+    build.vandps(tmp.reg, regOp(op), vectorAndMaskOp());
+    return tmp.reg;
+}
 IrConst IrLoweringX64::constOp(IrOp op) const
 {
     return function.constOp(op);
@@ -2279,6 +2298,7 @@ OperandX64 IrLoweringX64::vectorAndMaskOp()
 OperandX64 IrLoweringX64::vectorOrMaskOp()
 {
+    CODEGEN_ASSERT(!FFlag::LuauCodegenVectorTag);
     if (vectorOrMask.base == noreg)
         vectorOrMask = build.u32x4(0, 0, 0, LUA_TVECTOR);

CodeGen/src/IrLoweringX64.h

@@ -51,6 +51,7 @@ struct IrLoweringX64
     OperandX64 memRegTagOp(IrOp op);
     RegisterX64 regOp(IrOp op);
     OperandX64 bufferAddrOp(IrOp bufferOp, IrOp indexOp);
+    RegisterX64 vecOp(IrOp op, ScopedRegX64& tmp);
    IrConst constOp(IrOp op) const;
     uint8_t tagOp(IrOp op) const;