diff --git a/CodeGen/src/IrLoweringX64.cpp b/CodeGen/src/IrLoweringX64.cpp
index c5188dc4..b2b0ced2 100644
--- a/CodeGen/src/IrLoweringX64.cpp
+++ b/CodeGen/src/IrLoweringX64.cpp
@@ -16,6 +16,7 @@
 #include "lgc.h"
 
 LUAU_FASTFLAG(LuauCodegenVectorTag)
+LUAU_FASTFLAGVARIABLE(LuauCodegenVectorOptAnd, false)
 
 namespace Luau
 {
@@ -603,13 +604,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
-        ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-        ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+        ScopedRegX64 tmp1{regs};
+        ScopedRegX64 tmp2{regs};
 
-        // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-        build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-        build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-        build.vaddps(inst.regX64, tmp1.reg, tmp2.reg);
+        RegisterX64 tmpa = vecOp(inst.a, tmp1);
+        RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+
+        build.vaddps(inst.regX64, tmpa, tmpb);
 
         if (!FFlag::LuauCodegenVectorTag)
             build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
@@ -619,13 +620,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
-        ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-        ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+        ScopedRegX64 tmp1{regs};
+        ScopedRegX64 tmp2{regs};
 
-        // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-        build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-        build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-        build.vsubps(inst.regX64, tmp1.reg, tmp2.reg);
+        RegisterX64 tmpa = vecOp(inst.a, tmp1);
+        RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+
+        build.vsubps(inst.regX64, tmpa, tmpb);
         if (!FFlag::LuauCodegenVectorTag)
             build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
         break;
@@ -634,13 +635,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
-        ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-        ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+        ScopedRegX64 tmp1{regs};
+        ScopedRegX64 tmp2{regs};
 
-        // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-        build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-        build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-        build.vmulps(inst.regX64, tmp1.reg, tmp2.reg);
+        RegisterX64 tmpa = vecOp(inst.a, tmp1);
+        RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+
+        build.vmulps(inst.regX64, tmpa, tmpb);
         if (!FFlag::LuauCodegenVectorTag)
             build.vorps(inst.regX64, inst.regX64, vectorOrMaskOp());
         break;
@@ -649,13 +650,13 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, const IrBlock& next)
     {
         inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});
 
-        ScopedRegX64 tmp1{regs, SizeX64::xmmword};
-        ScopedRegX64 tmp2{regs, SizeX64::xmmword};
+        ScopedRegX64 tmp1{regs};
+        ScopedRegX64 tmp2{regs};
 
-        // Fourth component is the tag number which is interpreted as a denormal and has to be filtered out
-        build.vandps(tmp1.reg, regOp(inst.a), vectorAndMaskOp());
-        build.vandps(tmp2.reg, regOp(inst.b), vectorAndMaskOp());
-        build.vdivps(inst.regX64, tmp1.reg, tmp2.reg);
+        RegisterX64 tmpa = vecOp(inst.a, tmp1);
+        RegisterX64 tmpb = (inst.a == inst.b) ? tmpa : vecOp(inst.b, tmp2);
+
+        build.vdivps(inst.regX64, tmpa, tmpb);
         if (!FFlag::LuauCodegenVectorTag)
             build.vpinsrd(inst.regX64, inst.regX64, build.i32(LUA_TVECTOR), 3);
         break;
@@ -2234,6 +2235,24 @@ OperandX64 IrLoweringX64::bufferAddrOp(IrOp bufferOp, IrOp indexOp)
     return noreg;
 }
 
+RegisterX64 IrLoweringX64::vecOp(IrOp op, ScopedRegX64& tmp)
+{
+    if (FFlag::LuauCodegenVectorOptAnd && FFlag::LuauCodegenVectorTag)
+    {
+        IrInst source = function.instOp(op);
+        CODEGEN_ASSERT(source.cmd != IrCmd::SUBSTITUTE); // we don't process substitutions
+
+        // source that comes from memory or from tag instruction has .w = TVECTOR, which is denormal
+        // to avoid performance degradation on some CPUs we mask this component to produce zero
+        // otherwise we conservatively assume the vector is a result of a well formed math op so .w is a normal number or zero
+        if (source.cmd != IrCmd::LOAD_TVALUE && source.cmd != IrCmd::TAG_VECTOR)
+            return regOp(op);
+    }
+    tmp.alloc(SizeX64::xmmword);
+    build.vandps(tmp.reg, regOp(op), vectorAndMaskOp());
+    return tmp.reg;
+}
+
 IrConst IrLoweringX64::constOp(IrOp op) const
 {
     return function.constOp(op);
@@ -2279,6 +2298,7 @@ OperandX64 IrLoweringX64::vectorAndMaskOp()
 
 OperandX64 IrLoweringX64::vectorOrMaskOp()
 {
+    CODEGEN_ASSERT(!FFlag::LuauCodegenVectorTag);
     if (vectorOrMask.base == noreg)
         vectorOrMask = build.u32x4(0, 0, 0, LUA_TVECTOR);
 
diff --git a/CodeGen/src/IrLoweringX64.h b/CodeGen/src/IrLoweringX64.h
index f58a5d86..7ec4079e 100644
--- a/CodeGen/src/IrLoweringX64.h
+++ b/CodeGen/src/IrLoweringX64.h
@@ -51,6 +51,7 @@ struct IrLoweringX64
     OperandX64 memRegTagOp(IrOp op);
     RegisterX64 regOp(IrOp op);
    OperandX64 bufferAddrOp(IrOp bufferOp, IrOp indexOp);
+    RegisterX64 vecOp(IrOp op, ScopedRegX64& tmp);
     IrConst constOp(IrOp op) const;
     uint8_t tagOp(IrOp op) const;
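
For context, below is a minimal standalone sketch (not part of the patch) of the decision the new vecOp helper encodes when both flags are enabled: only operands whose fourth (.w) lane may still hold the LUA_TVECTOR tag, i.e. values coming from LOAD_TVALUE or TAG_VECTOR, get the vandps mask before vector math; anything else is passed through unmasked. The enum and function here are illustrative stand-ins, not the actual Luau CodeGen types.

#include <cstdio>

// Illustrative stand-in for the IR commands referenced in the patch.
enum class IrCmd { LOAD_TVALUE, TAG_VECTOR, ADD_VEC, SUB_VEC, MUL_VEC, DIV_VEC };

// Returns true when the operand's .w component may still carry the LUA_TVECTOR
// tag, which reads as a denormal float and is masked to zero before
// vaddps/vsubps/vmulps/vdivps to avoid denormal-related slowdowns on some CPUs.
static bool needsWMask(IrCmd source)
{
    // Values loaded from memory or produced by TAG_VECTOR carry the tag in .w;
    // results of earlier vector math already have a normal number (or zero) there.
    return source == IrCmd::LOAD_TVALUE || source == IrCmd::TAG_VECTOR;
}

int main()
{
    std::printf("LOAD_TVALUE: %d\n", needsWMask(IrCmd::LOAD_TVALUE)); // 1: mask needed
    std::printf("ADD_VEC:     %d\n", needsWMask(IrCmd::ADD_VEC));     // 0: mask skipped
    return 0;
}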