diff --git a/CodeGen/src/IrLoweringX64.cpp b/CodeGen/src/IrLoweringX64.cpp
index c5188dc4..b2b0ced2 100644
--- a/CodeGen/src/IrLoweringX64.cpp
+++ b/CodeGen/src/IrLoweringX64.cpp
@@ -16,6 +16,7 @@
 #include "lgc.h"
 
 LUAU_FASTFLAG(LuauCodegenVectorTag)
+LUAU_FASTFLAGVARIABLE(LuauCodegenVectorOptAnd, false)
 
 namespace Luau
 {
@@ -603,13 +604,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
-        ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-        ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+        ScopedRegX64 tmp1{regs};
+        ScopedRegX64 tmp2{regs};
 
-        // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-        build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-        build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-        build.vaddps(inst.regX64, tmp1.reg, tmp2.reg);
+        RegisterX64 tmpa = vecOp(inst.a, tmp1);
+        RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+
+        build.vaddps(inst.regX64, tmpa, tmpb);
 
         if (!FFlag::LuauCodegenVectorTag)
             build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
@@ -619,13 +620,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
-        ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-        ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+        ScopedRegX64 tmp1{regs};
+        ScopedRegX64 tmp2{regs};
 
-        // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-        build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-        build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-        build.vsubps(inst.regX64, tmp1.reg, tmp2.reg);
+        RegisterX64 tmpa = vecOp(inst.a, tmp1);
+        RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+
+        build.vsubps(inst.regX64, tmpa, tmpb);
         if (!FFlag::LuauCodegenVectorTag)
             build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
         break;
@@ -634,13 +635,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
-        ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-        ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+        ScopedRegX64 tmp1{regs};
+        ScopedRegX64 tmp2{regs};
 
-        // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-        build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-        build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-        build.vmulps(inst.regX64, tmp1.reg, tmp2.reg);
+        RegisterX64 tmpa = vecOp(inst.a, tmp1);
+        RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+
+        build.vmulps(inst.regX64, tmpa, tmpb);
         if (!FFlag::LuauCodegenVectorTag)
             build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
         break;
@@ -649,13 +650,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
-        ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-        ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+        ScopedRegX64 tmp1{regs};
+        ScopedRegX64 tmp2{regs};
 
-        // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-        build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-        build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-        build.vdivps(inst.regX64, tmp1.reg, tmp2.reg);
+        RegisterX64 tmpa = vecOp(inst.a, tmp1);
+        RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+
+        build.vdivps(inst.regX64, tmpa, tmpb);
         if (!FFlag::LuauCodegenVectorTag)
             build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
         break;
@@ -2234,6 +2235,24 @@ OperandX64 IrLoweringX64::bufferAddrOp(IrOp bufferOp, IrOp indexOp)
     return noreg;
 }
 
+RegisterX64 IrLoweringX64::vecOp(IrOp op, ScopedRegX64& tmp)
+{
+    if (FFlag::LuauCodegenVectorOptAnd && FFlag::LuauCodegenVectorTag)
+    {
+        IrInst source = function.instOp(op);
+        CODEGEN_ASSERT(source.cmd != IrCmd::SUBSTITUTE); // we don't process substitutions
+
+        // source that comes from memory or from tag instruction has .w = TVECTOR, which is denormal
+        // to avoid performance degradation on some CPUs we mask this component to produce zero
+        // otherwise we conservatively assume the vector is a result of a well formed math op so .w is a normal number or zero
+        if (source.cmd != IrCmd::LOAD_TVALUE && source.cmd != IrCmd::TAG_VECTOR)
+            return regOp(op);
+    }
+    tmp.alloc(SizeX64::xmmword);
+    build.vandps(tmp.reg, regOp(op), vectorAndMaskOp());
+    return tmp.reg;
+}
+
 IrConst IrLoweringX64::constOp(IrOp op) const
 {
     return function.constOp(op);
@@ -2279,6 +2298,7 @@ OperandX64 IrLoweringX64::vectorAndMaskOp()
 
 OperandX64 IrLoweringX64::vectorOrMaskOp()
 {
+    CODEGEN_ASSERT(!FFlag::LuauCodegenVectorTag);
     if (vectorOrMask.base == noreg)
         vectorOrMask = build.u32x4(0, 0, 0, LUA_TVECTOR);
 
diff --git a/CodeGen/src/IrLoweringX64.h b/CodeGen/src/IrLoweringX64.h
index f58a5d86..7ec4079e 100644
--- a/CodeGen/src/IrLoweringX64.h
+++ b/CodeGen/src/IrLoweringX64.h
@@ -51,6 +51,7 @@ struct IrLoweringX64
     OperandX64 memRegTagOp(IrOp op);
     RegisterX64 regOp(IrOp op);
    OperandX64 bufferAddrOp(IrOp bufferOp, IrOp indexOp);
+    RegisterX64 vecOp(IrOp op, ScopedRegX64& tmp);
     IrConst constOp(IrOp op) const;
     uint8_t tagOp(IrOp op) const;
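
For context, below is a minimal standalone sketch (not part of the patch) of the decision the new vecOp helper encodes when both flags are enabled: only operands whose fourth (.w) lane may still hold the LUA_TVECTOR tag, i.e. values coming from LOAD_TVALUE or TAG_VECTOR, get the vandps mask before vector math; anything else is passed through unmasked. The enum and function here are illustrative stand-ins, not the actual Luau CodeGen types.

#include <cstdio>

// Illustrative stand-in for the IR commands referenced in the patch.
enum class IrCmd { LOAD_TVALUE, TAG_VECTOR, ADD_VEC, SUB_VEC, MUL_VEC, DIV_VEC };

// Returns true when the operand's .w component may still carry the LUA_TVECTOR
// tag, which reads as a denormal float and is masked to zero before
// vaddps/vsubps/vmulps/vdivps to avoid denormal-related slowdowns on some CPUs.
static bool needsWMask(IrCmd source)
{
    // Values loaded from memory or produced by TAG_VECTOR carry the tag in .w;
    // results of earlier vector math already have a normal number (or zero) there.
    return source == IrCmd::LOAD_TVALUE || source == IrCmd::TAG_VECTOR;
}

int main()
{
    std::printf("LOAD_TVALUE: %d\n", needsWMask(IrCmd::LOAD_TVALUE)); // 1: mask needed
    std::printf("ADD_VEC:     %d\n", needsWMask(IrCmd::ADD_VEC));     // 0: mask skipped
    return 0;
}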