Sync to upstream/release/572 (#899)

* Fixed exported types not being suggested in autocomplete
* `T...` is now convertible to `...any` (Fixes https://github.com/Roblox/luau/issues/767)
* Fixed issue with `T?` not being convertible to `T | T` or `T?` (sometimes when internal pointer identity is different)
* Fixed potential crash in missing table key error suggestion to use a similar existing key
* `lua_topointer` now returns a pointer for strings

C++ API Changes:
* `prepareModuleScope` callback has moved from TypeChecker to Frontend
* For LSPs, AstQuery functions (and `isWithinComment`) can be used without full Frontend data

A lot of changes in our two experimental components as well.

In our work on the new type-solver, the following issues were fixed:
* Fixed table union and intersection indexing
* Correct custom type environments are now used
* Fixed issue with values of `free & number` type not accepted in numeric operations

And these are the changes in native code generation (JIT):
* arm64 lowering is almost complete with support for 99% of IR commands and all fastcalls
* Fixed x64 assembly encoding for extended byte registers
* More external x64 calls are aware of register allocator
* `math.min`/`math.max` with more than 2 arguments are now lowered to IR as well
* Fixed correctness issues with `math` library calls with multiple results in variadic context and with x64 register conflicts
* x64 register allocator learnt to restore values from VM memory instead of always using stack spills
* x64 exception unwind information now supports multiple functions and fixes function start offset in Dwarf2 info
2024-11-15 06:15:44 +08:00 · 2023-04-14 21:06:22 +03:00 · 2023-04-14 21:06:22 +03:00 · d141a5c48d
commit d141a5c48d
parent 7345891f6b
88 changed files with 2579 additions and 1433 deletions
--- a/Analysis/include/Luau/AstQuery.h
+++ b/Analysis/include/Luau/AstQuery.h
@ -64,8 +64,11 @@ private:
 };

 std::vector<AstNode*> findAncestryAtPositionForAutocomplete(const SourceModule& source, Position pos);
+std::vector<AstNode*> findAncestryAtPositionForAutocomplete(AstStatBlock* root, Position pos);
 std::vector<AstNode*> findAstAncestryOfPosition(const SourceModule& source, Position pos, bool includeTypes = false);
+std::vector<AstNode*> findAstAncestryOfPosition(AstStatBlock* root, Position pos, bool includeTypes = false);
 AstNode* findNodeAtPosition(const SourceModule& source, Position pos);
+AstNode* findNodeAtPosition(AstStatBlock* root, Position pos);
 AstExpr* findExprAtPosition(const SourceModule& source, Position pos);
 ScopePtr findScopeAtPosition(const Module& module, Position pos);
 std::optional<Binding> findBindingAtPosition(const Module& module, const SourceModule& source, Position pos);
--- a/Analysis/include/Luau/Frontend.h
+++ b/Analysis/include/Luau/Frontend.h
@ -165,7 +165,15 @@ struct Frontend
        bool captureComments, bool typeCheckForAutocomplete = false);

 private:
-    ModulePtr check(const SourceModule& sourceModule, Mode mode, std::vector<RequireCycle> requireCycles, bool forAutocomplete = false, bool recordJsonLog = false);
+    struct TypeCheckLimits // Optional per-check limits; Frontend::check copies each field onto the TypeChecker it constructs
+    {
+        std::optional<double> finishTime; // copied to TypeChecker::finishTime; the autocomplete path sets it to TimeTrace::getClock() + the time limit
+        std::optional<int> instantiationChildLimit; // copied to TypeChecker::instantiationChildLimit; derived from FInt::LuauTarjanChildLimit for autocomplete
+        std::optional<int> unifierIterationLimit; // copied to TypeChecker::unifierIterationLimit; derived from FInt::LuauTypeInferIterationLimit for autocomplete
+    };
+
+    ModulePtr check(const SourceModule& sourceModule, Mode mode, std::vector<RequireCycle> requireCycles, std::optional<ScopePtr> environmentScope,
+        bool forAutocomplete, bool recordJsonLog, TypeCheckLimits typeCheckLimits);

    std::pair<SourceNode*, SourceModule*> getSourceNode(const ModuleName& name);
    SourceModule parse(const ModuleName& name, std::string_view src, const ParseOptions& parseOptions);
@ -185,15 +193,21 @@ public:
    const NotNull<BuiltinTypes> builtinTypes;

    FileResolver* fileResolver;
+
    FrontendModuleResolver moduleResolver;
    FrontendModuleResolver moduleResolverForAutocomplete;
+
    GlobalTypes globals;
    GlobalTypes globalsForAutocomplete;
-    TypeChecker typeChecker;
-    TypeChecker typeCheckerForAutocomplete;
+
+    // TODO: remove with FFlagLuauOnDemandTypecheckers
+    TypeChecker typeChecker_DEPRECATED;
+    TypeChecker typeCheckerForAutocomplete_DEPRECATED;
+
    ConfigResolver* configResolver;
    FrontendOptions options;
    InternalErrorReporter iceHandler;
+    std::function<void(const ModuleName& name, const ScopePtr& scope, bool forAutocomplete)> prepareModuleScope;

    std::unordered_map<ModuleName, SourceNode> sourceNodes;
    std::unordered_map<ModuleName, SourceModule> sourceModules;
--- a/Analysis/include/Luau/Module.h
+++ b/Analysis/include/Luau/Module.h
@ -51,6 +51,7 @@ struct SourceModule
 };

 bool isWithinComment(const SourceModule& sourceModule, Position pos);
+bool isWithinComment(const ParseResult& result, Position pos);

 struct RequireCycle
 {
--- a/Analysis/include/Luau/Type.h
+++ b/Analysis/include/Luau/Type.h
@ -738,6 +738,7 @@ const T* get(TypeId tv)
    return get_if<T>(&tv->ty);
 }

+
 template<typename T>
 T* getMutable(TypeId tv)
 {
@ -897,6 +898,19 @@ bool hasTag(TypeId ty, const std::string& tagName);
 bool hasTag(const Property& prop, const std::string& tagName);
 bool hasTag(const Tags& tags, const std::string& tagName); // Do not use in new work.

+// Reports whether `ty`, after `follow`, holds a T, or whether any member that
+// flattenIntersection yields for it holds a T once that member is followed.
+template<typename T>
+bool hasTypeInIntersection(TypeId ty)
+{
+    const TypeId resolved = follow(ty);
+    bool found = get<T>(resolved) != nullptr;
+
+    if (!found)
+    {
+        for (auto part : flattenIntersection(resolved))
+        {
+            if (get<T>(follow(part)) != nullptr)
+            {
+                found = true;
+                break;
+            }
+        }
+    }
+
+    return found;
+}
+
+bool hasPrimitiveTypeInIntersection(TypeId ty, PrimitiveType::Type primTy);
 /*
 * Use this to change the kind of a particular type.
 *
--- a/Analysis/include/Luau/Unifier.h
+++ b/Analysis/include/Luau/Unifier.h
@ -137,9 +137,9 @@ private:

 public:
    // Returns true if the type "needle" already occurs within "haystack" and reports an "infinite type error"
-    bool occursCheck(TypeId needle, TypeId haystack);
+    bool occursCheck(TypeId needle, TypeId haystack, bool reversed);
    bool occursCheck(DenseHashSet<TypeId>& seen, TypeId needle, TypeId haystack);
-    bool occursCheck(TypePackId needle, TypePackId haystack);
+    bool occursCheck(TypePackId needle, TypePackId haystack, bool reversed);
    bool occursCheck(DenseHashSet<TypePackId>& seen, TypePackId needle, TypePackId haystack);

    Unifier makeChildUnifier();
--- a/Analysis/src/AstQuery.cpp
+++ b/Analysis/src/AstQuery.cpp
@ -211,33 +211,48 @@ struct FindFullAncestry final : public AstVisitor

 std::vector<AstNode*> findAncestryAtPositionForAutocomplete(const SourceModule& source, Position pos)
 {
-    AutocompleteNodeFinder finder{pos, source.root};
-    source.root->visit(&finder);
+    return findAncestryAtPositionForAutocomplete(source.root, pos);
+}
+
+std::vector<AstNode*> findAncestryAtPositionForAutocomplete(AstStatBlock* root, Position pos)
+{
+    AutocompleteNodeFinder finder{pos, root};
+    root->visit(&finder);
    return finder.ancestry;
 }

 std::vector<AstNode*> findAstAncestryOfPosition(const SourceModule& source, Position pos, bool includeTypes)
 {
-    const Position end = source.root->location.end;
+    return findAstAncestryOfPosition(source.root, pos, includeTypes);
+}
+
+std::vector<AstNode*> findAstAncestryOfPosition(AstStatBlock* root, Position pos, bool includeTypes)
+{
+    const Position end = root->location.end;
    if (pos > end)
        pos = end;

    FindFullAncestry finder(pos, end, includeTypes);
-    source.root->visit(&finder);
+    root->visit(&finder);
    return finder.nodes;
 }

 AstNode* findNodeAtPosition(const SourceModule& source, Position pos)
 {
-    const Position end = source.root->location.end;
-    if (pos < source.root->location.begin)
-        return source.root;
+    return findNodeAtPosition(source.root, pos);
+}
+
+AstNode* findNodeAtPosition(AstStatBlock* root, Position pos)
+{
+    const Position end = root->location.end;
+    if (pos < root->location.begin)
+        return root;

    if (pos > end)
        pos = end;

    FindNode findNode{pos, end};
-    findNode.visit(source.root);
+    findNode.visit(root);
    return findNode.best;
 }

--- a/Analysis/src/ConstraintSolver.cpp
+++ b/Analysis/src/ConstraintSolver.cpp
@ -595,6 +595,11 @@ bool ConstraintSolver::tryDispatch(const BinaryConstraint& c, NotNull<const Cons
     * make any sense to stop and wait for someone else to do it.
     */

+    // If any is present, the expression must evaluate to any as well.
+    bool leftAny = get<AnyType>(leftType) || get<ErrorType>(leftType);
+    bool rightAny = get<AnyType>(rightType) || get<ErrorType>(rightType);
+    bool anyPresent = leftAny || rightAny;
+
    if (isBlocked(leftType) && leftType != resultType)
        return block(c.leftType, constraint);

@ -604,12 +609,12 @@ bool ConstraintSolver::tryDispatch(const BinaryConstraint& c, NotNull<const Cons
    if (!force)
    {
        // Logical expressions may proceed if the LHS is free.
-        if (get<FreeType>(leftType) && !isLogical)
+        if (hasTypeInIntersection<FreeType>(leftType) && !isLogical)
            return block(leftType, constraint);
    }

    // Logical expressions may proceed if the LHS is free.
-    if (isBlocked(leftType) || (get<FreeType>(leftType) && !isLogical))
+    if (isBlocked(leftType) || (hasTypeInIntersection<FreeType>(leftType) && !isLogical))
    {
        asMutable(resultType)->ty.emplace<BoundType>(errorRecoveryType());
        unblock(resultType);
@ -696,11 +701,6 @@ bool ConstraintSolver::tryDispatch(const BinaryConstraint& c, NotNull<const Cons
        // If there's no metamethod available, fall back to primitive behavior.
    }

-    // If any is present, the expression must evaluate to any as well.
-    bool leftAny = get<AnyType>(leftType) || get<ErrorType>(leftType);
-    bool rightAny = get<AnyType>(rightType) || get<ErrorType>(rightType);
-    bool anyPresent = leftAny || rightAny;
-
    switch (c.op)
    {
    // For arithmetic operators, if the LHS is a number, the RHS must be a
@ -711,6 +711,8 @@ bool ConstraintSolver::tryDispatch(const BinaryConstraint& c, NotNull<const Cons
    case AstExprBinary::Op::Div:
    case AstExprBinary::Op::Pow:
    case AstExprBinary::Op::Mod:
+        if (hasTypeInIntersection<FreeType>(leftType) && force)
+            asMutable(leftType)->ty.emplace<BoundType>(anyPresent ? builtinTypes->anyType : builtinTypes->numberType);
        if (isNumber(leftType))
        {
            unify(leftType, rightType, constraint->scope);
@ -723,6 +725,8 @@ bool ConstraintSolver::tryDispatch(const BinaryConstraint& c, NotNull<const Cons
    // For concatenation, if the LHS is a string, the RHS must be a string as
    // well. The result will also be a string.
    case AstExprBinary::Op::Concat:
+        if (hasTypeInIntersection<FreeType>(leftType) && force)
+            asMutable(leftType)->ty.emplace<BoundType>(anyPresent ? builtinTypes->anyType : builtinTypes->stringType);
        if (isString(leftType))
        {
            unify(leftType, rightType, constraint->scope);
--- a/Analysis/src/Frontend.cpp
+++ b/Analysis/src/Frontend.cpp
@ -31,7 +31,8 @@ LUAU_FASTFLAG(LuauInferInNoCheckMode)
 LUAU_FASTFLAGVARIABLE(LuauKnowsTheDataModel3, false)
 LUAU_FASTINTVARIABLE(LuauAutocompleteCheckTimeoutMs, 100)
 LUAU_FASTFLAGVARIABLE(DebugLuauDeferredConstraintResolution, false)
-LUAU_FASTFLAGVARIABLE(DebugLuauLogSolverToJson, false);
+LUAU_FASTFLAGVARIABLE(DebugLuauLogSolverToJson, false)
+LUAU_FASTFLAGVARIABLE(LuauOnDemandTypecheckers, false)

 namespace Luau
 {
@ -131,8 +132,8 @@ static void persistCheckedTypes(ModulePtr checkedModule, GlobalTypes& globals, S
 LoadDefinitionFileResult Frontend::loadDefinitionFile(GlobalTypes& globals, ScopePtr targetScope, std::string_view source,
    const std::string& packageName, bool captureComments, bool typeCheckForAutocomplete)
 {
-    if (!FFlag::DebugLuauDeferredConstraintResolution)
-        return Luau::loadDefinitionFileNoDCR(typeCheckForAutocomplete ? typeCheckerForAutocomplete : typeChecker,
+    if (!FFlag::DebugLuauDeferredConstraintResolution && !FFlag::LuauOnDemandTypecheckers)
+        return Luau::loadDefinitionFileNoDCR(typeCheckForAutocomplete ? typeCheckerForAutocomplete_DEPRECATED : typeChecker_DEPRECATED,
            typeCheckForAutocomplete ? globalsForAutocomplete : globals, targetScope, source, packageName, captureComments);

    LUAU_TIMETRACE_SCOPE("loadDefinitionFile", "Frontend");
@ -142,7 +143,7 @@ LoadDefinitionFileResult Frontend::loadDefinitionFile(GlobalTypes& globals, Scop
    if (parseResult.errors.size() > 0)
        return LoadDefinitionFileResult{false, parseResult, sourceModule, nullptr};

-    ModulePtr checkedModule = check(sourceModule, Mode::Definition, {});
+    ModulePtr checkedModule = check(sourceModule, Mode::Definition, {}, std::nullopt, /*forAutocomplete*/ false, /*recordJsonLog*/ false, {});

    if (checkedModule->errors.size() > 0)
        return LoadDefinitionFileResult{false, parseResult, sourceModule, checkedModule};
@ -155,6 +156,7 @@ LoadDefinitionFileResult Frontend::loadDefinitionFile(GlobalTypes& globals, Scop
 LoadDefinitionFileResult loadDefinitionFileNoDCR(TypeChecker& typeChecker, GlobalTypes& globals, ScopePtr targetScope, std::string_view source,
    const std::string& packageName, bool captureComments)
 {
+    LUAU_ASSERT(!FFlag::LuauOnDemandTypecheckers);
    LUAU_TIMETRACE_SCOPE("loadDefinitionFile", "Frontend");

    Luau::SourceModule sourceModule;
@ -406,8 +408,8 @@ Frontend::Frontend(FileResolver* fileResolver, ConfigResolver* configResolver, c
    , moduleResolverForAutocomplete(this)
    , globals(builtinTypes)
    , globalsForAutocomplete(builtinTypes)
-    , typeChecker(globals.globalScope, &moduleResolver, builtinTypes, &iceHandler)
-    , typeCheckerForAutocomplete(globalsForAutocomplete.globalScope, &moduleResolverForAutocomplete, builtinTypes, &iceHandler)
+    , typeChecker_DEPRECATED(globals.globalScope, &moduleResolver, builtinTypes, &iceHandler)
+    , typeCheckerForAutocomplete_DEPRECATED(globalsForAutocomplete.globalScope, &moduleResolverForAutocomplete, builtinTypes, &iceHandler)
    , configResolver(configResolver)
    , options(options)
 {
@ -491,35 +493,68 @@ CheckResult Frontend::check(const ModuleName& name, std::optional<FrontendOption

        if (frontendOptions.forAutocomplete)
        {
-            // The autocomplete typecheck is always in strict mode with DM awareness
-            // to provide better type information for IDE features
-            typeCheckerForAutocomplete.requireCycles = requireCycles;
+            ModulePtr moduleForAutocomplete;

            double autocompleteTimeLimit = FInt::LuauAutocompleteCheckTimeoutMs / 1000.0;

-            if (autocompleteTimeLimit != 0.0)
-                typeCheckerForAutocomplete.finishTime = TimeTrace::getClock() + autocompleteTimeLimit;
-            else
-                typeCheckerForAutocomplete.finishTime = std::nullopt;
+            if (!FFlag::LuauOnDemandTypecheckers)
+            {
+                // The autocomplete typecheck is always in strict mode with DM awareness
+                // to provide better type information for IDE features
+                typeCheckerForAutocomplete_DEPRECATED.requireCycles = requireCycles;

-            // TODO: This is a dirty ad hoc solution for autocomplete timeouts
-            // We are trying to dynamically adjust our existing limits to lower total typechecking time under the limit
-            // so that we'll have type information for the whole file at lower quality instead of a full abort in the middle
-            if (FInt::LuauTarjanChildLimit > 0)
-                typeCheckerForAutocomplete.instantiationChildLimit = std::max(1, int(FInt::LuauTarjanChildLimit * sourceNode.autocompleteLimitsMult));
-            else
-                typeCheckerForAutocomplete.instantiationChildLimit = std::nullopt;
+                if (autocompleteTimeLimit != 0.0)
+                    typeCheckerForAutocomplete_DEPRECATED.finishTime = TimeTrace::getClock() + autocompleteTimeLimit;
+                else
+                    typeCheckerForAutocomplete_DEPRECATED.finishTime = std::nullopt;

-            if (FInt::LuauTypeInferIterationLimit > 0)
-                typeCheckerForAutocomplete.unifierIterationLimit =
-                    std::max(1, int(FInt::LuauTypeInferIterationLimit * sourceNode.autocompleteLimitsMult));
-            else
-                typeCheckerForAutocomplete.unifierIterationLimit = std::nullopt;
+                // TODO: This is a dirty ad hoc solution for autocomplete timeouts
+                // We are trying to dynamically adjust our existing limits to lower total typechecking time under the limit
+                // so that we'll have type information for the whole file at lower quality instead of a full abort in the middle
+                if (FInt::LuauTarjanChildLimit > 0)
+                    typeCheckerForAutocomplete_DEPRECATED.instantiationChildLimit =
+                        std::max(1, int(FInt::LuauTarjanChildLimit * sourceNode.autocompleteLimitsMult));
+                else
+                    typeCheckerForAutocomplete_DEPRECATED.instantiationChildLimit = std::nullopt;

-            ModulePtr moduleForAutocomplete =
-                FFlag::DebugLuauDeferredConstraintResolution
-                    ? check(sourceModule, Mode::Strict, requireCycles, /*forAutocomplete*/ true, /*recordJsonLog*/ false)
-                    : typeCheckerForAutocomplete.check(sourceModule, Mode::Strict, environmentScope);
+                if (FInt::LuauTypeInferIterationLimit > 0)
+                    typeCheckerForAutocomplete_DEPRECATED.unifierIterationLimit =
+                        std::max(1, int(FInt::LuauTypeInferIterationLimit * sourceNode.autocompleteLimitsMult));
+                else
+                    typeCheckerForAutocomplete_DEPRECATED.unifierIterationLimit = std::nullopt;
+
+                moduleForAutocomplete =
+                    FFlag::DebugLuauDeferredConstraintResolution
+                        ? check(sourceModule, Mode::Strict, requireCycles, environmentScope, /*forAutocomplete*/ true, /*recordJsonLog*/ false, {})
+                        : typeCheckerForAutocomplete_DEPRECATED.check(sourceModule, Mode::Strict, environmentScope);
+            }
+            else
+            {
+                // The autocomplete typecheck is always in strict mode with DM awareness
+                // to provide better type information for IDE features
+                TypeCheckLimits typeCheckLimits;
+
+                if (autocompleteTimeLimit != 0.0)
+                    typeCheckLimits.finishTime = TimeTrace::getClock() + autocompleteTimeLimit;
+                else
+                    typeCheckLimits.finishTime = std::nullopt;
+
+                // TODO: This is a dirty ad hoc solution for autocomplete timeouts
+                // We are trying to dynamically adjust our existing limits to lower total typechecking time under the limit
+                // so that we'll have type information for the whole file at lower quality instead of a full abort in the middle
+                if (FInt::LuauTarjanChildLimit > 0)
+                    typeCheckLimits.instantiationChildLimit = std::max(1, int(FInt::LuauTarjanChildLimit * sourceNode.autocompleteLimitsMult));
+                else
+                    typeCheckLimits.instantiationChildLimit = std::nullopt;
+
+                if (FInt::LuauTypeInferIterationLimit > 0)
+                    typeCheckLimits.unifierIterationLimit = std::max(1, int(FInt::LuauTypeInferIterationLimit * sourceNode.autocompleteLimitsMult));
+                else
+                    typeCheckLimits.unifierIterationLimit = std::nullopt;
+
+                moduleForAutocomplete = check(sourceModule, Mode::Strict, requireCycles, environmentScope, /*forAutocomplete*/ true,
+                    /*recordJsonLog*/ false, typeCheckLimits);
+            }

            moduleResolverForAutocomplete.modules[moduleName] = moduleForAutocomplete;

@ -543,13 +578,22 @@ CheckResult Frontend::check(const ModuleName& name, std::optional<FrontendOption
            continue;
        }

-        typeChecker.requireCycles = requireCycles;
-
        const bool recordJsonLog = FFlag::DebugLuauLogSolverToJson && moduleName == name;

-        ModulePtr module = (FFlag::DebugLuauDeferredConstraintResolution && mode == Mode::Strict)
-                               ? check(sourceModule, mode, requireCycles, /*forAutocomplete*/ false, recordJsonLog)
-                               : typeChecker.check(sourceModule, mode, environmentScope);
+        ModulePtr module;
+
+        if (!FFlag::LuauOnDemandTypecheckers)
+        {
+            typeChecker_DEPRECATED.requireCycles = requireCycles;
+
+            module = (FFlag::DebugLuauDeferredConstraintResolution && mode == Mode::Strict)
+                         ? check(sourceModule, mode, requireCycles, environmentScope, /*forAutocomplete*/ false, recordJsonLog, {})
+                         : typeChecker_DEPRECATED.check(sourceModule, mode, environmentScope);
+        }
+        else
+        {
+            module = check(sourceModule, mode, requireCycles, environmentScope, /*forAutocomplete*/ false, recordJsonLog, {});
+        }

        stats.timeCheck += getTimestamp() - timestamp;
        stats.filesStrict += mode == Mode::Strict;
@ -752,7 +796,7 @@ ScopePtr Frontend::getModuleEnvironment(const SourceModule& module, const Config
            AstName name = module.names->get(global.c_str());

            if (name.value)
-                result->bindings[name].typeId = typeChecker.anyType;
+                result->bindings[name].typeId = FFlag::LuauOnDemandTypecheckers ? builtinTypes->anyType : typeChecker_DEPRECATED.anyType;
        }
    }

@ -829,15 +873,15 @@ const SourceModule* Frontend::getSourceModule(const ModuleName& moduleName) cons

 ModulePtr check(const SourceModule& sourceModule, const std::vector<RequireCycle>& requireCycles, NotNull<BuiltinTypes> builtinTypes,
    NotNull<InternalErrorReporter> iceHandler, NotNull<ModuleResolver> moduleResolver, NotNull<FileResolver> fileResolver,
-    const ScopePtr& globalScope, FrontendOptions options)
+    const ScopePtr& parentScope, FrontendOptions options)
 {
    const bool recordJsonLog = FFlag::DebugLuauLogSolverToJson;
-    return check(sourceModule, requireCycles, builtinTypes, iceHandler, moduleResolver, fileResolver, globalScope, options, recordJsonLog);
+    return check(sourceModule, requireCycles, builtinTypes, iceHandler, moduleResolver, fileResolver, parentScope, options, recordJsonLog);
 }

 ModulePtr check(const SourceModule& sourceModule, const std::vector<RequireCycle>& requireCycles, NotNull<BuiltinTypes> builtinTypes,
    NotNull<InternalErrorReporter> iceHandler, NotNull<ModuleResolver> moduleResolver, NotNull<FileResolver> fileResolver,
-    const ScopePtr& globalScope, FrontendOptions options, bool recordJsonLog)
+    const ScopePtr& parentScope, FrontendOptions options, bool recordJsonLog)
 {
    ModulePtr result = std::make_shared<Module>();
    result->reduction = std::make_unique<TypeReduction>(NotNull{&result->internalTypes}, builtinTypes, iceHandler);
@ -868,7 +912,7 @@ ModulePtr check(const SourceModule& sourceModule, const std::vector<RequireCycle
        moduleResolver,
        builtinTypes,
        iceHandler,
-        globalScope,
+        parentScope,
        logger.get(),
        NotNull{&dfg},
    };
@ -911,11 +955,35 @@ ModulePtr check(const SourceModule& sourceModule, const std::vector<RequireCycle
    return result;
 }

-ModulePtr Frontend::check(const SourceModule& sourceModule, Mode mode, std::vector<RequireCycle> requireCycles, bool forAutocomplete, bool recordJsonLog)
+// Typechecks a single source module and returns the resulting Module.
+//
+// When FFlag::DebugLuauDeferredConstraintResolution is set and `mode` is Strict,
+// the work is delegated to Luau::check. Otherwise a TypeChecker is constructed
+// for this call only, configured with `requireCycles`, the optional
+// `typeCheckLimits` and the Frontend-level `prepareModuleScope` callback, and
+// then run on `sourceModule`.
+//
+// environmentScope: optional scope to check against; forwarded to
+//                   TypeChecker::check, or used in place of the global scope
+//                   on the Luau::check path when present.
+// forAutocomplete:  selects the autocomplete module resolver and, on the
+//                   TypeChecker path, the autocomplete global scope.
+// typeCheckLimits:  only applied on the TypeChecker path.
+ModulePtr Frontend::check(const SourceModule& sourceModule, Mode mode, std::vector<RequireCycle> requireCycles,
+    std::optional<ScopePtr> environmentScope, bool forAutocomplete, bool recordJsonLog, TypeCheckLimits typeCheckLimits)
 {
-    return Luau::check(sourceModule, requireCycles, builtinTypes, NotNull{&iceHandler},
-        NotNull{forAutocomplete ? &moduleResolverForAutocomplete : &moduleResolver}, NotNull{fileResolver},
-        forAutocomplete ? globalsForAutocomplete.globalScope : globals.globalScope, options, recordJsonLog);
+    if (FFlag::DebugLuauDeferredConstraintResolution && mode == Mode::Strict)
+    {
+        // NOTE(review): the removed implementation fell back to
+        // globalsForAutocomplete.globalScope here when forAutocomplete was set;
+        // confirm that always falling back to globals.globalScope is intended.
+        return Luau::check(sourceModule, requireCycles, builtinTypes, NotNull{&iceHandler},
+            NotNull{forAutocomplete ? &moduleResolverForAutocomplete : &moduleResolver}, NotNull{fileResolver},
+            environmentScope ? *environmentScope : globals.globalScope, options, recordJsonLog);
+    }
+    else
+    {
+        LUAU_ASSERT(FFlag::LuauOnDemandTypecheckers);
+
+        // Autocomplete checks must run against the autocomplete globals, exactly as
+        // typeCheckerForAutocomplete_DEPRECATED is constructed; using globals.globalScope
+        // unconditionally would pair the autocomplete resolver with the wrong global scope.
+        TypeChecker typeChecker(forAutocomplete ? globalsForAutocomplete.globalScope : globals.globalScope,
+            forAutocomplete ? &moduleResolverForAutocomplete : &moduleResolver, builtinTypes, &iceHandler);
+
+        if (prepareModuleScope)
+        {
+            // Adapt the three-argument Frontend callback to the two-argument TypeChecker one.
+            typeChecker.prepareModuleScope = [this, forAutocomplete](const ModuleName& name, const ScopePtr& scope) {
+                prepareModuleScope(name, scope, forAutocomplete);
+            };
+        }
+
+        typeChecker.requireCycles = requireCycles;
+        typeChecker.finishTime = typeCheckLimits.finishTime;
+        typeChecker.instantiationChildLimit = typeCheckLimits.instantiationChildLimit;
+        typeChecker.unifierIterationLimit = typeCheckLimits.unifierIterationLimit;
+
+        return typeChecker.check(sourceModule, mode, environmentScope);
+    }
 }

 // Read AST into sourceModules if necessary.  Trace require()s.  Report parse errors.
--- a/Analysis/src/Module.cpp
+++ b/Analysis/src/Module.cpp
@ -20,6 +20,7 @@ LUAU_FASTFLAGVARIABLE(LuauClonePublicInterfaceLess2, false);
 LUAU_FASTFLAG(LuauSubstitutionReentrant);
 LUAU_FASTFLAG(LuauClassTypeVarsInSubstitution);
 LUAU_FASTFLAG(LuauSubstitutionFixMissingFields);
+LUAU_FASTFLAGVARIABLE(LuauCopyExportedTypes, false);

 namespace Luau
 {
@ -37,14 +38,14 @@ static bool contains(Position pos, Comment comment)
        return false;
 }

-bool isWithinComment(const SourceModule& sourceModule, Position pos)
+static bool isWithinComment(const std::vector<Comment>& commentLocations, Position pos)
 {
-    auto iter = std::lower_bound(sourceModule.commentLocations.begin(), sourceModule.commentLocations.end(),
-        Comment{Lexeme::Comment, Location{pos, pos}}, [](const Comment& a, const Comment& b) {
+    auto iter = std::lower_bound(
+        commentLocations.begin(), commentLocations.end(), Comment{Lexeme::Comment, Location{pos, pos}}, [](const Comment& a, const Comment& b) {
            return a.location.end < b.location.end;
        });

-    if (iter == sourceModule.commentLocations.end())
+    if (iter == commentLocations.end())
        return false;

    if (contains(pos, *iter))
@ -53,12 +54,22 @@ bool isWithinComment(const SourceModule& sourceModule, Position pos)
    // Due to the nature of std::lower_bound, it is possible that iter points at a comment that ends
    // at pos.  We'll try the next comment, if it exists.
    ++iter;
-    if (iter == sourceModule.commentLocations.end())
+    if (iter == commentLocations.end())
        return false;

    return contains(pos, *iter);
 }

+// SourceModule overload: runs the shared comment lookup over the module's
+// recorded comment locations.
+bool isWithinComment(const SourceModule& sourceModule, Position pos)
+{
+    const auto& comments = sourceModule.commentLocations;
+    return isWithinComment(comments, pos);
+}
+
+// ParseResult overload: runs the shared comment lookup over the parse
+// result's recorded comment locations.
+bool isWithinComment(const ParseResult& result, Position pos)
+{
+    const auto& comments = result.commentLocations;
+    return isWithinComment(comments, pos);
+}
+
 struct ClonePublicInterface : Substitution
 {
    NotNull<BuiltinTypes> builtinTypes;
@ -227,7 +238,7 @@ void Module::clonePublicInterface(NotNull<BuiltinTypes> builtinTypes, InternalEr

    // Copy external stuff over to Module itself
    this->returnType = moduleScope->returnType;
-    if (FFlag::DebugLuauDeferredConstraintResolution)
+    if (FFlag::DebugLuauDeferredConstraintResolution || FFlag::LuauCopyExportedTypes)
        this->exportedTypeBindings = moduleScope->exportedTypeBindings;
    else
        this->exportedTypeBindings = std::move(moduleScope->exportedTypeBindings);
--- a/Analysis/src/Type.cpp
+++ b/Analysis/src/Type.cpp
@ -337,7 +337,16 @@ bool isSubset(const UnionType& super, const UnionType& sub)

    return true;
 }
+// Returns true when `ty`, once followed, is the primitive `primTy`, or when any
+// member that flattenIntersection yields for it is that primitive once followed.
+// Mirrors the traversal done by hasTypeInIntersection<T>.
+bool hasPrimitiveTypeInIntersection(TypeId ty, PrimitiveType::Type primTy)
+{
+    TypeId tf = follow(ty);
+    if (isPrim(tf, primTy))
+        return true;

+    // Every member has to be inspected: returning the first member's result
+    // unconditionally would ignore all members after it.
+    for (auto t : flattenIntersection(tf))
+        if (isPrim(follow(t), primTy))
+            return true;
+    return false;
+}
 // When typechecking an assignment `x = e`, we typecheck `x:T` and `e:U`,
 // then instantiate U if `isGeneric(U)` is true, and `maybeGeneric(T)` is false.
 bool isGeneric(TypeId ty)
--- a/Analysis/src/TypeChecker2.cpp
+++ b/Analysis/src/TypeChecker2.cpp
@ -1160,11 +1160,7 @@ struct TypeChecker2
        visit(expr, RValue);

        TypeId leftType = stripFromNilAndReport(lookupType(expr), location);
-        const NormalizedType* norm = normalizer.normalize(leftType);
-        if (!norm)
-            reportError(NormalizationTooComplex{}, location);
-
-        checkIndexTypeFromType(leftType, *norm, propName, location, context);
+        checkIndexTypeFromType(leftType, propName, location, context);
    }

    void visit(AstExprIndexName* indexName, ValueContext context)
@ -2033,8 +2029,16 @@ struct TypeChecker2
            reportError(std::move(e));
    }

-    void checkIndexTypeFromType(TypeId tableTy, const NormalizedType& norm, const std::string& prop, const Location& location, ValueContext context)
+    // If the provided type does not have the named property, report an error.
+    void checkIndexTypeFromType(TypeId tableTy, const std::string& prop, const Location& location, ValueContext context)
    {
+        const NormalizedType* norm = normalizer.normalize(tableTy);
+        if (!norm)
+        {
+            reportError(NormalizationTooComplex{}, location);
+            return;
+        }
+
        bool foundOneProp = false;
        std::vector<TypeId> typesMissingTheProp;

@ -2042,49 +2046,50 @@ struct TypeChecker2
            if (!normalizer.isInhabited(ty))
                return;

-            bool found = hasIndexTypeFromType(ty, prop, location);
+            std::unordered_set<TypeId> seen;
+            bool found = hasIndexTypeFromType(ty, prop, location, seen);
            foundOneProp |= found;
            if (!found)
                typesMissingTheProp.push_back(ty);
        };

-        fetch(norm.tops);
-        fetch(norm.booleans);
+        fetch(norm->tops);
+        fetch(norm->booleans);

        if (FFlag::LuauNegatedClassTypes)
        {
-            for (const auto& [ty, _negations] : norm.classes.classes)
+            for (const auto& [ty, _negations] : norm->classes.classes)
            {
                fetch(ty);
            }
        }
        else
        {
-            for (TypeId ty : norm.DEPRECATED_classes)
+            for (TypeId ty : norm->DEPRECATED_classes)
                fetch(ty);
        }
-        fetch(norm.errors);
-        fetch(norm.nils);
-        fetch(norm.numbers);
-        if (!norm.strings.isNever())
+        fetch(norm->errors);
+        fetch(norm->nils);
+        fetch(norm->numbers);
+        if (!norm->strings.isNever())
            fetch(builtinTypes->stringType);
-        fetch(norm.threads);
-        for (TypeId ty : norm.tables)
+        fetch(norm->threads);
+        for (TypeId ty : norm->tables)
            fetch(ty);
-        if (norm.functions.isTop)
+        if (norm->functions.isTop)
            fetch(builtinTypes->functionType);
-        else if (!norm.functions.isNever())
+        else if (!norm->functions.isNever())
        {
-            if (norm.functions.parts.size() == 1)
-                fetch(norm.functions.parts.front());
+            if (norm->functions.parts.size() == 1)
+                fetch(norm->functions.parts.front());
            else
            {
                std::vector<TypeId> parts;
-                parts.insert(parts.end(), norm.functions.parts.begin(), norm.functions.parts.end());
+                parts.insert(parts.end(), norm->functions.parts.begin(), norm->functions.parts.end());
                fetch(testArena.addType(IntersectionType{std::move(parts)}));
            }
        }
-        for (const auto& [tyvar, intersect] : norm.tyvars)
+        for (const auto& [tyvar, intersect] : norm->tyvars)
        {
            if (get<NeverType>(intersect->tops))
            {
@ -2110,8 +2115,15 @@ struct TypeChecker2
        }
    }

-    bool hasIndexTypeFromType(TypeId ty, const std::string& prop, const Location& location)
+    bool hasIndexTypeFromType(TypeId ty, const std::string& prop, const Location& location, std::unordered_set<TypeId>& seen)
    {
+        // If we have already encountered this type, we must assume that some
+        // other codepath will do the right thing and signal false if the
+        // property is not present.
+        const bool isUnseen = seen.insert(ty).second;
+        if (!isUnseen)
+            return true;
+
        if (get<ErrorType>(ty) || get<AnyType>(ty) || get<NeverType>(ty))
            return true;

@ -2136,10 +2148,12 @@ struct TypeChecker2
        else if (const ClassType* cls = get<ClassType>(ty))
            return bool(lookupClassProp(cls, prop));
        else if (const UnionType* utv = get<UnionType>(ty))
-            ice.ice("getIndexTypeFromTypeHelper cannot take a UnionType");
+            return std::all_of(begin(utv), end(utv), [&](TypeId part) {
+                return hasIndexTypeFromType(part, prop, location, seen);
+            });
        else if (const IntersectionType* itv = get<IntersectionType>(ty))
            return std::any_of(begin(itv), end(itv), [&](TypeId part) {
-                return hasIndexTypeFromType(part, prop, location);
+                return hasIndexTypeFromType(part, prop, location, seen);
            });
        else
            return false;
--- a/Analysis/src/TypeInfer.cpp
+++ b/Analysis/src/TypeInfer.cpp
@ -35,14 +35,13 @@ LUAU_FASTFLAG(LuauKnowsTheDataModel3)
 LUAU_FASTFLAGVARIABLE(DebugLuauFreezeDuringUnification, false)
 LUAU_FASTFLAGVARIABLE(LuauReturnAnyInsteadOfICE, false) // Eventually removed as false.
 LUAU_FASTFLAGVARIABLE(DebugLuauSharedSelf, false)
-LUAU_FASTFLAGVARIABLE(LuauTryhardAnd, false)
 LUAU_FASTFLAG(LuauInstantiateInSubtyping)
 LUAU_FASTFLAG(LuauNegatedClassTypes)
 LUAU_FASTFLAGVARIABLE(LuauAllowIndexClassParameters, false)
 LUAU_FASTFLAG(LuauUninhabitedSubAnything2)
+LUAU_FASTFLAG(LuauOccursIsntAlwaysFailure)
 LUAU_FASTFLAGVARIABLE(LuauTypecheckTypeguards, false)
 LUAU_FASTFLAGVARIABLE(LuauTinyControlFlowAnalysis, false)
-LUAU_FASTFLAGVARIABLE(LuauReducingAndOr, false)

 namespace Luau
 {
@ -1623,9 +1622,28 @@ ControlFlow TypeChecker::check(const ScopePtr& scope, const AstStatTypeAlias& ty

    TypeId& bindingType = bindingsMap[name].type;

-    if (unify(ty, bindingType, aliasScope, typealias.location))
-        bindingType = ty;
+    if (!FFlag::LuauOccursIsntAlwaysFailure)
+    {
+        if (unify(ty, bindingType, aliasScope, typealias.location))
+            bindingType = ty;
+        return ControlFlow::None;
+    }

+    unify(ty, bindingType, aliasScope, typealias.location);
+
+    // It is possible for this unification to succeed but for
+    // `bindingType` still to be free. For example, in
+    // `type T = T|T`, we generate a fresh free type `X`, and then
+    // unify `X` with `X|X`, which succeeds without binding `X` to
+    // anything, since `X <: X|X`.
+    if (bindingType->ty.get_if<FreeType>())
+    {
+        ty = errorRecoveryType(aliasScope);
+        unify(ty, bindingType, aliasScope, typealias.location);
+        reportError(TypeError{typealias.location, OccursCheckFailed{}});
+    }
+
+    bindingType = ty;
    return ControlFlow::None;
 }

@ -2848,7 +2866,7 @@ TypeId TypeChecker::checkRelationalOperation(
        {
            return lhsType;
        }
-        else if (FFlag::LuauTryhardAnd)
+        else
        {
            // If lhs is free, we can't tell which 'falsy' components it has, if any
            if (get<FreeType>(lhsType))
@ -2860,14 +2878,11 @@ TypeId TypeChecker::checkRelationalOperation(
            {
                LUAU_ASSERT(oty);

-                if (FFlag::LuauReducingAndOr)
-                {
-                    // Perform a limited form of type reduction for booleans
-                    if (isPrim(*oty, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(rhsType))))
-                        return booleanType;
-                    if (isPrim(rhsType, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(*oty))))
-                        return booleanType;
-                }
+                // Perform a limited form of type reduction for booleans
+                if (isPrim(*oty, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(rhsType))))
+                    return booleanType;
+                if (isPrim(rhsType, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(*oty))))
+                    return booleanType;

                return unionOfTypes(*oty, rhsType, scope, expr.location, false);
            }
@ -2876,16 +2891,12 @@ TypeId TypeChecker::checkRelationalOperation(
                return rhsType;
            }
        }
-        else
-        {
-            return unionOfTypes(rhsType, booleanType, scope, expr.location, false);
-        }
    case AstExprBinary::Or:
        if (lhsIsAny)
        {
            return lhsType;
        }
-        else if (FFlag::LuauTryhardAnd)
+        else
        {
            auto [oty, notNever] = pickTypesFromSense(lhsType, true, neverType); // Filter out truthy types

@ -2893,14 +2904,11 @@ TypeId TypeChecker::checkRelationalOperation(
            {
                LUAU_ASSERT(oty);

-                if (FFlag::LuauReducingAndOr)
-                {
-                    // Perform a limited form of type reduction for booleans
-                    if (isPrim(*oty, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(rhsType))))
-                        return booleanType;
-                    if (isPrim(rhsType, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(*oty))))
-                        return booleanType;
-                }
+                // Perform a limited form of type reduction for booleans
+                if (isPrim(*oty, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(rhsType))))
+                    return booleanType;
+                if (isPrim(rhsType, PrimitiveType::Boolean) && get<BooleanSingleton>(get<SingletonType>(follow(*oty))))
+                    return booleanType;

                return unionOfTypes(*oty, rhsType, scope, expr.location);
            }
@ -2909,10 +2917,6 @@ TypeId TypeChecker::checkRelationalOperation(
                return rhsType;
            }
        }
-        else
-        {
-            return unionOfTypes(lhsType, rhsType, scope, expr.location);
-        }
    default:
        LUAU_ASSERT(0);
        ice(format("checkRelationalOperation called with incorrect binary expression '%s'", toString(expr.op).c_str()), expr.location);
--- a/Analysis/src/Unifier.cpp
+++ b/Analysis/src/Unifier.cpp
@ -19,8 +19,10 @@ LUAU_FASTINT(LuauTypeInferTypePackLoopLimit)
 LUAU_FASTFLAG(LuauErrorRecoveryType)
 LUAU_FASTFLAGVARIABLE(LuauInstantiateInSubtyping, false)
 LUAU_FASTFLAGVARIABLE(LuauUninhabitedSubAnything2, false)
+LUAU_FASTFLAGVARIABLE(LuauVariadicAnyCanBeGeneric, false)
 LUAU_FASTFLAGVARIABLE(LuauMaintainScopesInUnifier, false)
 LUAU_FASTFLAGVARIABLE(LuauTransitiveSubtyping, false)
+LUAU_FASTFLAGVARIABLE(LuauOccursIsntAlwaysFailure, false)
 LUAU_FASTFLAG(LuauClassTypeVarsInSubstitution)
 LUAU_FASTFLAG(DebugLuauDeferredConstraintResolution)
 LUAU_FASTFLAG(LuauNormalizeBlockedTypes)
@ -431,14 +433,14 @@ void Unifier::tryUnify_(TypeId subTy, TypeId superTy, bool isFunctionCall, bool

    if (superFree && subFree && subsumes(useScopes, superFree, subFree))
    {
-        if (!occursCheck(subTy, superTy))
+        if (!occursCheck(subTy, superTy, /* reversed = */ false))
            log.replace(subTy, BoundType(superTy));

        return;
    }
    else if (superFree && subFree)
    {
-        if (!occursCheck(superTy, subTy))
+        if (!occursCheck(superTy, subTy, /* reversed = */ true))
        {
            if (subsumes(useScopes, superFree, subFree))
            {
@ -461,7 +463,7 @@ void Unifier::tryUnify_(TypeId subTy, TypeId superTy, bool isFunctionCall, bool
            return;
        }

-        if (!occursCheck(superTy, subTy))
+        if (!occursCheck(superTy, subTy, /* reversed = */ true))
        {
            promoteTypeLevels(log, types, superFree->level, superFree->scope, useScopes, subTy);

@ -487,7 +489,7 @@ void Unifier::tryUnify_(TypeId subTy, TypeId superTy, bool isFunctionCall, bool
            return;
        }

-        if (!occursCheck(subTy, superTy))
+        if (!occursCheck(subTy, superTy, /* reversed = */ false))
        {
            promoteTypeLevels(log, types, subFree->level, subFree->scope, useScopes, superTy);
            log.replace(subTy, BoundType(superTy));
@ -1593,7 +1595,7 @@ void Unifier::tryUnify_(TypePackId subTp, TypePackId superTp, bool isFunctionCal

    if (log.getMutable<FreeTypePack>(superTp))
    {
-        if (!occursCheck(superTp, subTp))
+        if (!occursCheck(superTp, subTp, /* reversed = */ true))
        {
            Widen widen{types, builtinTypes};
            log.replace(superTp, Unifiable::Bound<TypePackId>(widen(subTp)));
@ -1601,7 +1603,7 @@ void Unifier::tryUnify_(TypePackId subTp, TypePackId superTp, bool isFunctionCal
    }
    else if (log.getMutable<FreeTypePack>(subTp))
    {
-        if (!occursCheck(subTp, superTp))
+        if (!occursCheck(subTp, superTp, /* reversed = */ false))
        {
            log.replace(subTp, Unifiable::Bound<TypePackId>(superTp));
        }
@ -2585,13 +2587,14 @@ static void queueTypePack(std::vector<TypeId>& queue, DenseHashSet<TypePackId>&
 void Unifier::tryUnifyVariadics(TypePackId subTp, TypePackId superTp, bool reversed, int subOffset)
 {
    const VariadicTypePack* superVariadic = log.getMutable<VariadicTypePack>(superTp);
+    const TypeId variadicTy = follow(superVariadic->ty);

    if (!superVariadic)
        ice("passed non-variadic pack to tryUnifyVariadics");

    if (const VariadicTypePack* subVariadic = log.get<VariadicTypePack>(subTp))
    {
-        tryUnify_(reversed ? superVariadic->ty : subVariadic->ty, reversed ? subVariadic->ty : superVariadic->ty);
+        tryUnify_(reversed ? variadicTy : subVariadic->ty, reversed ? subVariadic->ty : variadicTy);
    }
    else if (log.get<TypePack>(subTp))
    {
@ -2602,7 +2605,7 @@ void Unifier::tryUnifyVariadics(TypePackId subTp, TypePackId superTp, bool rever

        while (subIter != subEnd)
        {
-            tryUnify_(reversed ? superVariadic->ty : *subIter, reversed ? *subIter : superVariadic->ty);
+            tryUnify_(reversed ? variadicTy : *subIter, reversed ? *subIter : variadicTy);
            ++subIter;
        }

@ -2615,7 +2618,7 @@ void Unifier::tryUnifyVariadics(TypePackId subTp, TypePackId superTp, bool rever
            }
            else if (const VariadicTypePack* vtp = get<VariadicTypePack>(tail))
            {
-                tryUnify_(vtp->ty, superVariadic->ty);
+                tryUnify_(vtp->ty, variadicTy);
            }
            else if (get<GenericTypePack>(tail))
            {
@ -2631,6 +2634,10 @@ void Unifier::tryUnifyVariadics(TypePackId subTp, TypePackId superTp, bool rever
            }
        }
    }
+    else if (FFlag::LuauVariadicAnyCanBeGeneric && get<AnyType>(variadicTy) && log.get<GenericTypePack>(subTp))
+    {
+        // Nothing to do.  This is ok.
+    }
    else
    {
        reportError(location, GenericError{"Failed to unify variadic packs"});
@ -2751,11 +2758,42 @@ TxnLog Unifier::combineLogsIntoUnion(std::vector<TxnLog> logs)
    return result;
 }

-bool Unifier::occursCheck(TypeId needle, TypeId haystack)
+bool Unifier::occursCheck(TypeId needle, TypeId haystack, bool reversed)
 {
    sharedState.tempSeenTy.clear();

-    return occursCheck(sharedState.tempSeenTy, needle, haystack);
+    bool occurs = occursCheck(sharedState.tempSeenTy, needle, haystack);
+
+    if (occurs && FFlag::LuauOccursIsntAlwaysFailure)
+    {
+        Unifier innerState = makeChildUnifier();
+        if (const UnionType* ut = get<UnionType>(haystack))
+        {
+            if (reversed)
+                innerState.tryUnifyUnionWithType(haystack, ut, needle);
+            else
+                innerState.tryUnifyTypeWithUnion(needle, haystack, ut, /* cacheEnabled = */ false, /* isFunction = */ false);
+        }
+        else if (const IntersectionType* it = get<IntersectionType>(haystack))
+        {
+            if (reversed)
+                innerState.tryUnifyIntersectionWithType(haystack, it, needle, /* cacheEnabled = */ false, /* isFunction = */ false);
+            else
+                innerState.tryUnifyTypeWithIntersection(needle, haystack, it);
+        }
+        else
+        {
+            innerState.failure = true;
+        }
+
+        if (innerState.failure)
+        {
+            reportError(location, OccursCheckFailed{});
+            log.replace(needle, *builtinTypes->errorRecoveryType());
+        }
+    }
+
+    return occurs;
 }

 bool Unifier::occursCheck(DenseHashSet<TypeId>& seen, TypeId needle, TypeId haystack)
@ -2785,8 +2823,11 @@ bool Unifier::occursCheck(DenseHashSet<TypeId>& seen, TypeId needle, TypeId hays

    if (needle == haystack)
    {
-        reportError(location, OccursCheckFailed{});
-        log.replace(needle, *builtinTypes->errorRecoveryType());
+        if (!FFlag::LuauOccursIsntAlwaysFailure)
+        {
+            reportError(location, OccursCheckFailed{});
+            log.replace(needle, *builtinTypes->errorRecoveryType());
+        }

        return true;
    }
@ -2807,11 +2848,19 @@ bool Unifier::occursCheck(DenseHashSet<TypeId>& seen, TypeId needle, TypeId hays
    return occurrence;
 }

-bool Unifier::occursCheck(TypePackId needle, TypePackId haystack)
+bool Unifier::occursCheck(TypePackId needle, TypePackId haystack, bool reversed)
 {
    sharedState.tempSeenTp.clear();

-    return occursCheck(sharedState.tempSeenTp, needle, haystack);
+    bool occurs = occursCheck(sharedState.tempSeenTp, needle, haystack);
+
+    if (occurs && FFlag::LuauOccursIsntAlwaysFailure)
+    {
+        reportError(location, OccursCheckFailed{});
+        log.replace(needle, *builtinTypes->errorRecoveryTypePack());
+    }
+
+    return occurs;
 }

 bool Unifier::occursCheck(DenseHashSet<TypePackId>& seen, TypePackId needle, TypePackId haystack)
@ -2836,8 +2885,11 @@ bool Unifier::occursCheck(DenseHashSet<TypePackId>& seen, TypePackId needle, Typ
    {
        if (needle == haystack)
        {
-            reportError(location, OccursCheckFailed{});
-            log.replace(needle, *builtinTypes->errorRecoveryTypePack());
+            if (!FFlag::LuauOccursIsntAlwaysFailure)
+            {
+                reportError(location, OccursCheckFailed{});
+                log.replace(needle, *builtinTypes->errorRecoveryTypePack());
+            }

            return true;
        }
--- a/Ast/src/StringUtils.cpp
+++ b/Ast/src/StringUtils.cpp
@ -167,7 +167,9 @@ size_t editDistance(std::string_view a, std::string_view b)

        for (size_t y = 1; y <= b.size(); ++y)
        {
-            size_t x1 = seenCharToRow[b[y - 1]];
+            // b[N] can be negative when plain char is signed and the byte is non-ASCII (e.g. UTF-8), so index through unsigned char
+            unsigned char bSeenCharIndex = static_cast<unsigned char>(b[y - 1]);
+            size_t x1 = seenCharToRow[bSeenCharIndex];
            size_t y1 = lastMatchedY;

            size_t cost = 1;
@ -187,7 +189,9 @@ size_t editDistance(std::string_view a, std::string_view b)
            distances[getPos(x + 1, y + 1)] = std::min(std::min(insertion, deletion), std::min(substitution, transposition));
        }

-        seenCharToRow[a[x - 1]] = x;
+        // a[N] can be negative when plain char is signed and the byte is non-ASCII (e.g. UTF-8), so index through unsigned char
+        unsigned char aSeenCharIndex = static_cast<unsigned char>(a[x - 1]);
+        seenCharToRow[aSeenCharIndex] = x;
    }

    return distances[getPos(a.size() + 1, b.size() + 1)];
--- a/CodeGen/include/Luau/AddressA64.h
+++ b/CodeGen/include/Luau/AddressA64.h
@ -29,7 +29,7 @@ struct AddressA64
    // For example, ldr x0, [reg+imm] is limited to 8 KB offsets assuming imm is divisible by 8, but loading into w0 reduces the range to 4 KB
    static constexpr size_t kMaxOffset = 1023;

-    AddressA64(RegisterA64 base, int off = 0)
+    constexpr AddressA64(RegisterA64 base, int off = 0)
        : kind(AddressKindA64::imm)
        , base(base)
        , offset(xzr)
@ -38,7 +38,7 @@ struct AddressA64
        LUAU_ASSERT(base.kind == KindA64::x || base == sp);
    }

-    AddressA64(RegisterA64 base, RegisterA64 offset)
+    constexpr AddressA64(RegisterA64 base, RegisterA64 offset)
        : kind(AddressKindA64::reg)
        , base(base)
        , offset(offset)
--- a/CodeGen/include/Luau/AssemblyBuilderA64.h
+++ b/CodeGen/include/Luau/AssemblyBuilderA64.h
@ -49,17 +49,25 @@ public:
    void cmp(RegisterA64 src1, RegisterA64 src2);
    void cmp(RegisterA64 src1, uint16_t src2);
    void csel(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, ConditionA64 cond);
+    void cset(RegisterA64 dst, ConditionA64 cond);

    // Bitwise
-    // TODO: support immediate arguments (they have odd encoding and forbid many values)
-    // TODO: support bic (andnot)
    // TODO: support shifts
    // TODO: support bitfield ops
    void and_(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
    void orr(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
    void eor(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
+    void bic(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
+    void tst(RegisterA64 src1, RegisterA64 src2);
    void mvn(RegisterA64 dst, RegisterA64 src);

+    // Bitwise with immediate
+    // Note: immediate must have a single contiguous sequence of 1 bits set of length 1..31
+    void and_(RegisterA64 dst, RegisterA64 src1, uint32_t src2);
+    void orr(RegisterA64 dst, RegisterA64 src1, uint32_t src2);
+    void eor(RegisterA64 dst, RegisterA64 src1, uint32_t src2);
+    void tst(RegisterA64 src1, uint32_t src2);
+
    // Shifts
    void lsl(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
    void lsr(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2);
@ -168,7 +176,7 @@ public:
 private:
    // Instruction archetypes
    void place0(const char* name, uint32_t word);
-    void placeSR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift = 0);
+    void placeSR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift = 0, int N = 0);
    void placeSR2(const char* name, RegisterA64 dst, RegisterA64 src, uint8_t op, uint8_t op2 = 0);
    void placeR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, uint8_t op2);
    void placeR1(const char* name, RegisterA64 dst, RegisterA64 src, uint32_t op);
@ -181,8 +189,9 @@ private:
    void placeADR(const char* name, RegisterA64 src, uint8_t op);
    void placeADR(const char* name, RegisterA64 src, uint8_t op, Label& label);
    void placeP(const char* name, RegisterA64 dst1, RegisterA64 dst2, AddressA64 src, uint8_t op, uint8_t opc, int sizelog);
-    void placeCS(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, ConditionA64 cond, uint8_t op, uint8_t opc);
+    void placeCS(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, ConditionA64 cond, uint8_t op, uint8_t opc, int invert = 0);
    void placeFCMP(const char* name, RegisterA64 src1, RegisterA64 src2, uint8_t op, uint8_t opc);
+    void placeBM(const char* name, RegisterA64 dst, RegisterA64 src1, uint32_t src2, uint8_t op);

    void place(uint32_t word);

--- a/CodeGen/include/Luau/IrCallWrapperX64.h
+++ b/CodeGen/include/Luau/IrCallWrapperX64.h
@ -41,12 +41,14 @@ public:

    void call(const OperandX64& func);

+    RegisterX64 suggestNextArgumentRegister(SizeX64 size) const;
+
    IrRegAllocX64& regs;
    AssemblyBuilderX64& build;
    uint32_t instIdx = ~0u;

 private:
-    void assignTargetRegisters();
+    OperandX64 getNextArgumentTarget(SizeX64 size) const;
    void countRegisterUses();
    CallArgument* findNonInterferingArgument();
    bool interferesWithOperand(const OperandX64& op, RegisterX64 reg) const;
@ -67,6 +69,9 @@ private:
    std::array<CallArgument, kMaxCallArguments> args;
    int argCount = 0;

+    int gprPos = 0;
+    int xmmPos = 0;
+
    OperandX64 funcOp;

    // Internal counters for remaining register use counts
--- a/CodeGen/include/Luau/IrData.h
+++ b/CodeGen/include/Luau/IrData.h
@ -155,7 +155,7 @@ enum class IrCmd : uint8_t

    // Compute Luau 'not' operation on destructured TValue
    // A: tag
-    // B: double
+    // B: int (value)
    NOT_ANY, // TODO: boolean specialization will be useful

    // Unconditional jump
@ -233,7 +233,7 @@ enum class IrCmd : uint8_t

    // Try to get pointer to tag method TValue inside the table's metatable or jump if there is no such value or metatable
    // A: table
-    // B: int
+    // B: int (TMS enum)
    // C: block
    TRY_CALL_FASTGETTM,

@ -256,8 +256,8 @@ enum class IrCmd : uint8_t
    // B: Rn (result start)
    // C: Rn (argument start)
    // D: Rn or Kn or a boolean that's false (optional second argument)
-    // E: int (argument count or -1 to use all arguments up to stack top)
-    // F: int (result count or -1 to preserve all results and adjust stack top)
+    // E: int (argument count)
+    // F: int (result count)
    FASTCALL,

    // Call the fastcall builtin function
@ -517,8 +517,10 @@ enum class IrCmd : uint8_t
    FALLBACK_FORGPREP,

    // Instruction that passes value through, it is produced by constant folding and users substitute it with the value
+    // When operand location is set, updates the tracked location of the value in memory
    SUBSTITUTE,
    // A: operand of any type
+    // B: Rn/Kn/none (location of operand in memory; optional)
 };

 enum class IrConstKind : uint8_t
@ -694,6 +696,9 @@ struct IrFunction

    std::vector<BytecodeMapping> bcMapping;

+    // For each instruction, an operand that can be used to recompute the value
+    std::vector<IrOp> valueRestoreOps;
+
    Proto* proto = nullptr;

    CfgInfo cfg;
@ -829,19 +834,40 @@ struct IrFunction
        return value.valueDouble;
    }

-    uint32_t getBlockIndex(const IrBlock& block)
+    uint32_t getBlockIndex(const IrBlock& block) const
    {
        // Can only be called with blocks from our vector
        LUAU_ASSERT(&block >= blocks.data() && &block <= blocks.data() + blocks.size());
        return uint32_t(&block - blocks.data());
    }

-    uint32_t getInstIndex(const IrInst& inst)
+    uint32_t getInstIndex(const IrInst& inst) const
    {
        // Can only be called with instructions from our vector
        LUAU_ASSERT(&inst >= instructions.data() && &inst <= instructions.data() + instructions.size());
        return uint32_t(&inst - instructions.data());
    }
+
+    void recordRestoreOp(uint32_t instIdx, IrOp location)
+    {
+        if (instIdx >= valueRestoreOps.size())
+            valueRestoreOps.resize(instIdx + 1);
+
+        valueRestoreOps[instIdx] = location;
+    }
+
+    IrOp findRestoreOp(uint32_t instIdx) const
+    {
+        if (instIdx >= valueRestoreOps.size())
+            return {};
+
+        return valueRestoreOps[instIdx];
+    }
+
+    IrOp findRestoreOp(const IrInst& inst) const
+    {
+        return findRestoreOp(getInstIndex(inst));
+    }
 };

 inline IrCondition conditionOp(IrOp op)
--- a/CodeGen/include/Luau/IrRegAllocX64.h
+++ b/CodeGen/include/Luau/IrRegAllocX64.h
@ -20,7 +20,9 @@ constexpr uint8_t kNoStackSlot = 0xff;
 struct IrSpillX64
 {
    uint32_t instIdx = 0;
-    bool useDoubleSlot = 0;
+    IrValueKind valueKind = IrValueKind::Unknown;
+
+    unsigned spillId = 0;

    // Spill location can be a stack location or be empty
    // When it's empty, it means that instruction value can be rematerialized
@ -33,12 +35,8 @@ struct IrRegAllocX64
 {
    IrRegAllocX64(AssemblyBuilderX64& build, IrFunction& function);

-    RegisterX64 allocGprReg(SizeX64 preferredSize, uint32_t instIdx);
-    RegisterX64 allocXmmReg(uint32_t instIdx);
-
-    RegisterX64 allocGprRegOrReuse(SizeX64 preferredSize, uint32_t instIdx, std::initializer_list<IrOp> oprefs);
-    RegisterX64 allocXmmRegOrReuse(uint32_t instIdx, std::initializer_list<IrOp> oprefs);
-
+    RegisterX64 allocReg(SizeX64 size, uint32_t instIdx);
+    RegisterX64 allocRegOrReuse(SizeX64 size, uint32_t instIdx, std::initializer_list<IrOp> oprefs);
    RegisterX64 takeReg(RegisterX64 reg, uint32_t instIdx);

    void freeReg(RegisterX64 reg);
@ -49,6 +47,12 @@ struct IrRegAllocX64

    bool shouldFreeGpr(RegisterX64 reg) const;

+    unsigned findSpillStackSlot(IrValueKind valueKind);
+
+    IrOp getRestoreOp(const IrInst& inst) const;
+    bool hasRestoreOp(const IrInst& inst) const;
+    OperandX64 getRestoreAddress(const IrInst& inst, IrOp restoreOp);
+
    // Register used by instruction is about to be freed, have to find a way to restore value later
    void preserve(IrInst& inst);

@ -74,6 +78,7 @@ struct IrRegAllocX64

    std::bitset<256> usedSpillSlots;
    unsigned maxUsedSlot = 0;
+    unsigned nextSpillId = 1;
    std::vector<IrSpillX64> spills;
 };

@ -107,10 +112,8 @@ struct ScopedSpills
    ScopedSpills(const ScopedSpills&) = delete;
    ScopedSpills& operator=(const ScopedSpills&) = delete;

-    bool wasSpilledBefore(const IrSpillX64& spill) const;
-
    IrRegAllocX64& owner;
-    std::vector<IrSpillX64> snapshot;
+    unsigned startSpillId = 0;
 };

 } // namespace X64
--- a/CodeGen/include/Luau/IrUtils.h
+++ b/CodeGen/include/Luau/IrUtils.h
@ -200,7 +200,7 @@ void replace(IrFunction& function, IrOp& original, IrOp replacement);
 void replace(IrFunction& function, IrBlock& block, uint32_t instIdx, IrInst replacement);

 // Replace instruction with a different value (using IrCmd::SUBSTITUTE)
-void substitute(IrFunction& function, IrInst& inst, IrOp replacement);
+void substitute(IrFunction& function, IrInst& inst, IrOp replacement, IrOp location = {});

 // Replace instruction arguments that point to substitutions with target values
 void applySubstitutions(IrFunction& function, IrOp& op);
--- a/CodeGen/include/Luau/RegisterA64.h
+++ b/CodeGen/include/Luau/RegisterA64.h
@ -46,6 +46,18 @@ constexpr RegisterA64 castReg(KindA64 kind, RegisterA64 reg)
    return RegisterA64{kind, reg.index};
 }

+// This is equivalent to castReg(KindA64::x), but is separate because it implies different semantics
+// Specifically, there are cases when it's useful to treat a wN register as an xN register *after* it has been assigned a value
+// Since all A64 instructions that write to wN implicitly zero the top half, this works when we need zero extension semantics
+// Crucially, this is *not* safe on an ABI boundary - an int parameter in wN register may have anything in its top half in certain cases
+// However, as long as our codegen doesn't use 32-bit truncation by using castReg x=>w, we can safely rely on this.
+constexpr RegisterA64 zextReg(RegisterA64 reg)
+{
+    LUAU_ASSERT(reg.kind == KindA64::w);
+
+    return RegisterA64{KindA64::x, reg.index};
+}
+
 constexpr RegisterA64 noreg{KindA64::none, 0};

 constexpr RegisterA64 w0{KindA64::w, 0};
--- a/CodeGen/include/Luau/RegisterX64.h
+++ b/CodeGen/include/Luau/RegisterX64.h
@ -46,6 +46,18 @@ constexpr RegisterX64 al{SizeX64::byte, 0};
 constexpr RegisterX64 cl{SizeX64::byte, 1};
 constexpr RegisterX64 dl{SizeX64::byte, 2};
 constexpr RegisterX64 bl{SizeX64::byte, 3};
+constexpr RegisterX64 spl{SizeX64::byte, 4};
+constexpr RegisterX64 bpl{SizeX64::byte, 5};
+constexpr RegisterX64 sil{SizeX64::byte, 6};
+constexpr RegisterX64 dil{SizeX64::byte, 7};
+constexpr RegisterX64 r8b{SizeX64::byte, 8};
+constexpr RegisterX64 r9b{SizeX64::byte, 9};
+constexpr RegisterX64 r10b{SizeX64::byte, 10};
+constexpr RegisterX64 r11b{SizeX64::byte, 11};
+constexpr RegisterX64 r12b{SizeX64::byte, 12};
+constexpr RegisterX64 r13b{SizeX64::byte, 13};
+constexpr RegisterX64 r14b{SizeX64::byte, 14};
+constexpr RegisterX64 r15b{SizeX64::byte, 15};

 constexpr RegisterX64 eax{SizeX64::dword, 0};
 constexpr RegisterX64 ecx{SizeX64::dword, 1};
--- a/CodeGen/include/Luau/UnwindBuilder.h
+++ b/CodeGen/include/Luau/UnwindBuilder.h
@ -11,6 +11,9 @@ namespace Luau
 namespace CodeGen
 {

+// This value is used in 'finishFunction' to mark the function that spans to the end of the whole code block
+static uint32_t kFullBlockFuncton = ~0u;
+
 class UnwindBuilder
 {
 public:
@ -19,19 +22,22 @@ public:
    virtual void setBeginOffset(size_t beginOffset) = 0;
    virtual size_t getBeginOffset() const = 0;

-    virtual void start() = 0;
+    virtual void startInfo() = 0;

+    virtual void startFunction() = 0;
    virtual void spill(int espOffset, X64::RegisterX64 reg) = 0;
    virtual void save(X64::RegisterX64 reg) = 0;
    virtual void allocStack(int size) = 0;
    virtual void setupFrameReg(X64::RegisterX64 reg, int espOffset) = 0;
+    virtual void finishFunction(uint32_t beginOffset, uint32_t endOffset) = 0;

-    virtual void finish() = 0;
+    virtual void finishInfo() = 0;

    virtual size_t getSize() const = 0;
+    virtual size_t getFunctionCount() const = 0;

    // This will place the unwinding data at the target address and might update values of some fields
-    virtual void finalize(char* target, void* funcAddress, size_t funcSize) const = 0;
+    virtual void finalize(char* target, size_t offset, void* funcAddress, size_t funcSize) const = 0;
 };

 } // namespace CodeGen
--- a/CodeGen/include/Luau/UnwindBuilderDwarf2.h
+++ b/CodeGen/include/Luau/UnwindBuilderDwarf2.h
@ -4,34 +4,48 @@
 #include "Luau/RegisterX64.h"
 #include "UnwindBuilder.h"

+#include <vector>
+
 namespace Luau
 {
 namespace CodeGen
 {

+struct UnwindFunctionDwarf2
+{
+    uint32_t beginOffset;
+    uint32_t endOffset;
+    uint32_t fdeEntryStartPos;
+};
+
 class UnwindBuilderDwarf2 : public UnwindBuilder
 {
 public:
    void setBeginOffset(size_t beginOffset) override;
    size_t getBeginOffset() const override;

-    void start() override;
+    void startInfo() override;

+    void startFunction() override;
    void spill(int espOffset, X64::RegisterX64 reg) override;
    void save(X64::RegisterX64 reg) override;
    void allocStack(int size) override;
    void setupFrameReg(X64::RegisterX64 reg, int espOffset) override;
+    void finishFunction(uint32_t beginOffset, uint32_t endOffset) override;

-    void finish() override;
+    void finishInfo() override;

    size_t getSize() const override;
+    size_t getFunctionCount() const override;

-    void finalize(char* target, void* funcAddress, size_t funcSize) const override;
+    void finalize(char* target, size_t offset, void* funcAddress, size_t funcSize) const override;

 private:
    size_t beginOffset = 0;

-    static const unsigned kRawDataLimit = 128;
+    std::vector<UnwindFunctionDwarf2> unwindFunctions;
+
+    static const unsigned kRawDataLimit = 1024;
    uint8_t rawData[kRawDataLimit];
    uint8_t* pos = rawData;

--- a/CodeGen/include/Luau/UnwindBuilderWin.h
+++ b/CodeGen/include/Luau/UnwindBuilderWin.h
@ -11,6 +11,25 @@ namespace Luau
 namespace CodeGen
 {

+// This struct matches the layout of x64 RUNTIME_FUNCTION from winnt.h
+struct UnwindFunctionWin
+{
+    uint32_t beginOffset;
+    uint32_t endOffset;
+    uint32_t unwindInfoOffset;
+};
+
+// This struct matches the layout of x64 UNWIND_INFO from ehdata.h
+struct UnwindInfoWin
+{
+    uint8_t version : 3;
+    uint8_t flags : 5;
+    uint8_t prologsize;
+    uint8_t unwindcodecount;
+    uint8_t framereg : 4;
+    uint8_t frameregoff : 4;
+};
+
 // This struct matches the layout of UNWIND_CODE from ehdata.h
 struct UnwindCodeWin
 {
@ -25,31 +44,38 @@ public:
    void setBeginOffset(size_t beginOffset) override;
    size_t getBeginOffset() const override;

-    void start() override;
+    void startInfo() override;

+    void startFunction() override;
    void spill(int espOffset, X64::RegisterX64 reg) override;
    void save(X64::RegisterX64 reg) override;
    void allocStack(int size) override;
    void setupFrameReg(X64::RegisterX64 reg, int espOffset) override;
+    void finishFunction(uint32_t beginOffset, uint32_t endOffset) override;

-    void finish() override;
+    void finishInfo() override;

    size_t getSize() const override;
+    size_t getFunctionCount() const override;

-    void finalize(char* target, void* funcAddress, size_t funcSize) const override;
+    void finalize(char* target, size_t offset, void* funcAddress, size_t funcSize) const override;

 private:
    size_t beginOffset = 0;

+    static const unsigned kRawDataLimit = 1024;
+    uint8_t rawData[kRawDataLimit];
+    uint8_t* rawDataPos = rawData;
+
+    std::vector<UnwindFunctionWin> unwindFunctions;
+
    // Windows unwind codes are written in reverse, so we have to collect them all first
    std::vector<UnwindCodeWin> unwindCodes;

    uint8_t prologSize = 0;
-    X64::RegisterX64 frameReg = X64::rax; // rax means that frame register is not used
+    X64::RegisterX64 frameReg = X64::noreg;
    uint8_t frameRegOffset = 0;
    uint32_t stackOffset = 0;
-
-    size_t infoSize = 0;
 };

 } // namespace CodeGen
--- a/CodeGen/src/AssemblyBuilderA64.cpp
+++ b/CodeGen/src/AssemblyBuilderA64.cpp
@ -1,6 +1,7 @@
 // This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
 #include "Luau/AssemblyBuilderA64.h"

+#include "BitUtils.h"
 #include "ByteUtils.h"

 #include <stdarg.h>
@ -126,6 +127,15 @@ void AssemblyBuilderA64::csel(RegisterA64 dst, RegisterA64 src1, RegisterA64 src
    placeCS("csel", dst, src1, src2, cond, 0b11010'10'0, 0b00);
 }

+void AssemblyBuilderA64::cset(RegisterA64 dst, ConditionA64 cond) // Emits 'cset dst, cond' through the conditional-select encoder placeCS
+{
+    LUAU_ASSERT(dst.kind == KindA64::x || dst.kind == KindA64::w); // only x and w register kinds are accepted as destination
+
+    RegisterA64 src = dst.kind == KindA64::x ? xzr : wzr; // zero register of the same kind as dst
+
+    placeCS("cset", dst, src, src, cond, 0b11010'10'0, 0b01, /* invert= */ 1); // both sources are the zero register; placeCS XORs 'invert' into the encoded condition code
+}
+
 void AssemblyBuilderA64::and_(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
 {
    placeSR3("and", dst, src1, src2, 0b00'01010);
@ -141,11 +151,45 @@ void AssemblyBuilderA64::eor(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2
    placeSR3("eor", dst, src1, src2, 0b10'01010);
 }

+void AssemblyBuilderA64::bic(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2) // Emits 'bic dst, src1, src2' (A64 bit clear: src1 AND NOT src2)
+{
+    placeSR3("bic", dst, src1, src2, 0b00'01010, /* shift= */ 0, /* N= */ 1); // same op value as the register form of and_, but with N=1 (placeSR3 puts N at bit 21)
+}
+
+void AssemblyBuilderA64::tst(RegisterA64 src1, RegisterA64 src2) // Emits 'tst src1, src2' (register form)
+{
+    RegisterA64 dst = src1.kind == KindA64::x ? xzr : wzr; // destination is the zero register matching src1's kind, so no general register is written
+
+    placeSR3("tst", dst, src1, src2, 0b11'01010);
+}
+
 void AssemblyBuilderA64::mvn(RegisterA64 dst, RegisterA64 src)
 {
    placeSR2("mvn", dst, src, 0b01'01010, 0b1);
 }

+void AssemblyBuilderA64::and_(RegisterA64 dst, RegisterA64 src1, uint32_t src2) // Immediate form of 'and'; src2 must pass placeBM's bitmask assertions
+{
+    placeBM("and", dst, src1, src2, 0b00'100100);
+}
+
+void AssemblyBuilderA64::orr(RegisterA64 dst, RegisterA64 src1, uint32_t src2) // Immediate form of 'orr'; src2 must pass placeBM's bitmask assertions
+{
+    placeBM("orr", dst, src1, src2, 0b01'100100);
+}
+
+void AssemblyBuilderA64::eor(RegisterA64 dst, RegisterA64 src1, uint32_t src2) // Immediate form of 'eor'; src2 must pass placeBM's bitmask assertions
+{
+    placeBM("eor", dst, src1, src2, 0b10'100100);
+}
+
+void AssemblyBuilderA64::tst(RegisterA64 src1, uint32_t src2) // Immediate form of 'tst'; src2 must pass placeBM's bitmask assertions
+{
+    RegisterA64 dst = src1.kind == KindA64::x ? xzr : wzr; // destination is the zero register matching src1's kind, so no general register is written
+
+    placeBM("tst", dst, src1, src2, 0b11'100100);
+}
+
 void AssemblyBuilderA64::lsl(RegisterA64 dst, RegisterA64 src1, RegisterA64 src2)
 {
    placeR3("lsl", dst, src1, src2, 0b11010110, 0b0010'00);
@ -583,7 +627,7 @@ void AssemblyBuilderA64::place0(const char* name, uint32_t op)
    commit();
 }

-void AssemblyBuilderA64::placeSR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift)
+void AssemblyBuilderA64::placeSR3(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, uint8_t op, int shift, int N)
 {
    if (logText)
        log(name, dst, src1, src2, shift);
@ -594,7 +638,7 @@ void AssemblyBuilderA64::placeSR3(const char* name, RegisterA64 dst, RegisterA64

    uint32_t sf = (dst.kind == KindA64::x) ? 0x80000000 : 0;

-    place(dst.index | (src1.index << 5) | (shift << 10) | (src2.index << 16) | (op << 24) | sf);
+    place(dst.index | (src1.index << 5) | (shift << 10) | (src2.index << 16) | (N << 21) | (op << 24) | sf);
    commit();
 }

@ -764,7 +808,8 @@ void AssemblyBuilderA64::placeP(const char* name, RegisterA64 src1, RegisterA64
    commit();
 }

-void AssemblyBuilderA64::placeCS(const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, ConditionA64 cond, uint8_t op, uint8_t opc)
+void AssemblyBuilderA64::placeCS(
+    const char* name, RegisterA64 dst, RegisterA64 src1, RegisterA64 src2, ConditionA64 cond, uint8_t op, uint8_t opc, int invert)
 {
    if (logText)
        log(name, dst, src1, src2, cond);
@ -773,7 +818,7 @@ void AssemblyBuilderA64::placeCS(const char* name, RegisterA64 dst, RegisterA64

    uint32_t sf = (dst.kind == KindA64::x) ? 0x80000000 : 0;

-    place(dst.index | (src1.index << 5) | (opc << 10) | (codeForCondition[int(cond)] << 12) | (src2.index << 16) | (op << 21) | sf);
+    place(dst.index | (src1.index << 5) | (opc << 10) | ((codeForCondition[int(cond)] ^ invert) << 12) | (src2.index << 16) | (op << 21) | sf);
    commit();
 }

@ -793,6 +838,29 @@ void AssemblyBuilderA64::placeFCMP(const char* name, RegisterA64 src1, RegisterA
    commit();
 }

+void AssemblyBuilderA64::placeBM(const char* name, RegisterA64 dst, RegisterA64 src1, uint32_t src2, uint8_t op) // Encodes a logical instruction with immediate mask src2; only a single non-wrapping run of 1 bits is accepted
+{
+    if (logText)
+        log(name, dst, src1, src2);
+
+    LUAU_ASSERT(dst.kind == KindA64::w || dst.kind == KindA64::x);
+    LUAU_ASSERT(dst.kind == src1.kind); // destination and source must be the same register kind
+
+    uint32_t sf = (dst.kind == KindA64::x) ? 0x80000000 : 0; // top bit of the instruction word is set for x registers
+
+    int lz = countlz(src2); // zero bits above the highest set bit (32 if src2 == 0)
+    int rz = countrz(src2); // zero bits below the lowest set bit (32 if src2 == 0)
+
+    LUAU_ASSERT(lz + rz > 0 && lz + rz < 32);                 // must have at least one 0 and at least one 1
+    LUAU_ASSERT((src2 >> rz) == (1u << (32 - lz - rz)) - 1u); // sequence of 1s must be contiguous
+
+    int imms = 31 - lz - rz;   // count of 1s minus 1
+    int immr = (32 - rz) & 31; // right rotate amount
+
+    place(dst.index | (src1.index << 5) | (imms << 10) | (immr << 16) | (op << 23) | sf); // field positions: dst at bit 0, src1 at 5, imms at 10, immr at 16, op at 23
+    commit();
+}
+
 void AssemblyBuilderA64::place(uint32_t word)
 {
    LUAU_ASSERT(codePos < codeEnd);
@ -965,10 +1033,13 @@ void AssemblyBuilderA64::log(const char* opcode, RegisterA64 dst, RegisterA64 sr
 {
    logAppend(" %-12s", opcode);
    log(dst);
-    text.append(",");
-    log(src1);
-    text.append(",");
-    log(src2);
+    if ((src1 != wzr && src1 != xzr) || (src2 != wzr && src2 != xzr))
+    {
+        text.append(",");
+        log(src1);
+        text.append(",");
+        log(src2);
+    }
    text.append(",");
    text.append(textForCondition[int(cond)] + 2); // skip b.
    text.append("\n");
--- a/CodeGen/src/AssemblyBuilderX64.cpp
+++ b/CodeGen/src/AssemblyBuilderX64.cpp
@ -31,7 +31,8 @@ static_assert(sizeof(setccTextForCondition) / sizeof(setccTextForCondition[0]) =
 #define OP_PLUS_REG(op, reg) ((op) + (reg & 0x7))
 #define OP_PLUS_CC(op, cc) ((op) + uint8_t(cc))

-#define REX_W(value) (value ? 0x8 : 0x0)
+#define REX_W_BIT(value) (value ? 0x8 : 0x0)
+#define REX_W(reg) REX_W_BIT((reg).size == SizeX64::qword || ((reg).size == SizeX64::byte && (reg).index >= 4))
 #define REX_R(reg) (((reg).index & 0x8) >> 1)
 #define REX_X(reg) (((reg).index & 0x8) >> 2)
 #define REX_B(reg) (((reg).index & 0x8) >> 3)
@ -1116,7 +1117,7 @@ void AssemblyBuilderX64::placeAvx(

 void AssemblyBuilderX64::placeRex(RegisterX64 op)
 {
-    uint8_t code = REX_W(op.size == SizeX64::qword) | REX_B(op);
+    uint8_t code = REX_W(op) | REX_B(op);

    if (code != 0)
        place(code | 0x40);
@ -1127,9 +1128,9 @@ void AssemblyBuilderX64::placeRex(OperandX64 op)
    uint8_t code = 0;

    if (op.cat == CategoryX64::reg)
-        code = REX_W(op.base.size == SizeX64::qword) | REX_B(op.base);
+        code = REX_W(op.base) | REX_B(op.base);
    else if (op.cat == CategoryX64::mem)
-        code = REX_W(op.memSize == SizeX64::qword) | REX_X(op.index) | REX_B(op.base);
+        code = REX_W_BIT(op.memSize == SizeX64::qword) | REX_X(op.index) | REX_B(op.base);
    else
        LUAU_ASSERT(!"No encoding for left operand of this category");

@ -1154,7 +1155,7 @@ void AssemblyBuilderX64::placeRexNoW(OperandX64 op)

 void AssemblyBuilderX64::placeRex(RegisterX64 lhs, OperandX64 rhs)
 {
-    uint8_t code = REX_W(lhs.size == SizeX64::qword);
+    uint8_t code = REX_W(lhs);

    if (rhs.cat == CategoryX64::imm)
        code |= REX_B(lhs);
--- a/CodeGen/src/BitUtils.h
+++ b/CodeGen/src/BitUtils.h
@ -0,0 +1,36 @@
+// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
+#pragma once
+
+#include <stdint.h>
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+namespace Luau
+{
+namespace CodeGen
+{
+
+inline int countlz(uint32_t n) // Returns the number of leading (most significant) zero bits in n; 32 when n == 0
+{
+#ifdef _MSC_VER
+    unsigned long rl;
+    return _BitScanReverse(&rl, n) ? 31 - int(rl) : 32; // _BitScanReverse returns 0 when n == 0; otherwise rl is the index of the highest set bit
+#else
+    return n == 0 ? 32 : __builtin_clz(n); // explicit zero check because __builtin_clz(0) is undefined
+#endif
+}
+
+inline int countrz(uint32_t n) // Returns the number of trailing (least significant) zero bits in n; 32 when n == 0
+{
+#ifdef _MSC_VER
+    unsigned long rl;
+    return _BitScanForward(&rl, n) ? int(rl) : 32; // _BitScanForward returns 0 when n == 0; otherwise rl is the index of the lowest set bit
+#else
+    return n == 0 ? 32 : __builtin_ctz(n); // explicit zero check because __builtin_ctz(0) is undefined
+#endif
+}
+
+} // namespace CodeGen
+} // namespace Luau
--- a/CodeGen/src/CodeBlockUnwind.cpp
+++ b/CodeGen/src/CodeBlockUnwind.cpp
@ -54,31 +54,6 @@ namespace CodeGen

 void* createBlockUnwindInfo(void* context, uint8_t* block, size_t blockSize, size_t& beginOffset)
 {
-#if defined(_WIN32) && defined(_M_X64)
-    UnwindBuilder* unwind = (UnwindBuilder*)context;
-
-    // All unwinding related data is placed together at the start of the block
-    size_t unwindSize = sizeof(RUNTIME_FUNCTION) + unwind->getSize();
-    unwindSize = (unwindSize + (kCodeAlignment - 1)) & ~(kCodeAlignment - 1); // Match code allocator alignment
-    LUAU_ASSERT(blockSize >= unwindSize);
-
-    RUNTIME_FUNCTION* runtimeFunc = (RUNTIME_FUNCTION*)block;
-    runtimeFunc->BeginAddress = DWORD(unwindSize);                    // Code will start after the unwind info
-    runtimeFunc->EndAddress = DWORD(blockSize);                       // Whole block is a part of a 'single function'
-    runtimeFunc->UnwindInfoAddress = DWORD(sizeof(RUNTIME_FUNCTION)); // Unwind info is placed at the start of the block
-
-    char* unwindData = (char*)block + runtimeFunc->UnwindInfoAddress;
-    unwind->finalize(unwindData, block + unwindSize, blockSize - unwindSize);
-
-    if (!RtlAddFunctionTable(runtimeFunc, 1, uintptr_t(block)))
-    {
-        LUAU_ASSERT(!"failed to allocate function table");
-        return nullptr;
-    }
-
-    beginOffset = unwindSize + unwind->getBeginOffset();
-    return block;
-#elif !defined(_WIN32)
    UnwindBuilder* unwind = (UnwindBuilder*)context;

    // All unwinding related data is placed together at the start of the block
@ -87,37 +62,34 @@ void* createBlockUnwindInfo(void* context, uint8_t* block, size_t blockSize, siz
    LUAU_ASSERT(blockSize >= unwindSize);

    char* unwindData = (char*)block;
-    unwind->finalize(unwindData, block, blockSize);
+    unwind->finalize(unwindData, unwindSize, block, blockSize);

-#if defined(__APPLE__)
+#if defined(_WIN32) && defined(_M_X64)
+    if (!RtlAddFunctionTable((RUNTIME_FUNCTION*)block, uint32_t(unwind->getFunctionCount()), uintptr_t(block)))
+    {
+        LUAU_ASSERT(!"failed to allocate function table");
+        return nullptr;
+    }
+#elif defined(__APPLE__)
    visitFdeEntries(unwindData, __register_frame);
-#else
+#elif !defined(_WIN32)
    __register_frame(unwindData);
 #endif

    beginOffset = unwindSize + unwind->getBeginOffset();
    return block;
-#endif
-
-    return nullptr;
 }

 void destroyBlockUnwindInfo(void* context, void* unwindData)
 {
 #if defined(_WIN32) && defined(_M_X64)
-    RUNTIME_FUNCTION* runtimeFunc = (RUNTIME_FUNCTION*)unwindData;
-
-    if (!RtlDeleteFunctionTable(runtimeFunc))
+    if (!RtlDeleteFunctionTable((RUNTIME_FUNCTION*)unwindData))
        LUAU_ASSERT(!"failed to deallocate function table");
-#elif !defined(_WIN32)
-
-#if defined(__APPLE__)
+#elif defined(__APPLE__)
    visitFdeEntries((char*)unwindData, __deregister_frame);
-#else
+#elif !defined(_WIN32)
    __deregister_frame(unwindData);
 #endif
-
-#endif
 }

 } // namespace CodeGen
--- a/CodeGen/src/CodeGen.cpp
+++ b/CodeGen/src/CodeGen.cpp
@ -176,6 +176,10 @@ static bool lowerImpl(AssemblyBuilder& build, IrLowering& lowering, IrFunction&

            IrInst& inst = function.instructions[index];

+            // Substitutions might have meta information about operand restore location from memory
+            if (inst.cmd == IrCmd::SUBSTITUTE && inst.b.kind != IrOpKind::None)
+                function.recordRestoreOp(inst.a.index, inst.b);
+
            // Skip pseudo instructions, but make sure they are not used at this stage
            // This also prevents them from getting into text output when that's enabled
            if (isPseudo(inst.cmd))
@ -195,7 +199,18 @@ static bool lowerImpl(AssemblyBuilder& build, IrLowering& lowering, IrFunction&
            lowering.lowerInst(inst, index, next);

            if (lowering.hasError())
+            {
+                // Place labels for all blocks that we're skipping
+                // This is needed to avoid AssemblyBuilder assertions about jumps in earlier blocks with unplaced labels
+                for (size_t j = i + 1; j < sortedBlocks.size(); ++j)
+                {
+                    IrBlock& abandoned = function.blocks[sortedBlocks[j]];
+
+                    build.setLabel(abandoned.label);
+                }
+
                return false;
+            }
        }

        if (options.includeIr)
@ -223,12 +238,8 @@ static bool lowerImpl(AssemblyBuilder& build, IrLowering& lowering, IrFunction&
 [[maybe_unused]] static bool lowerIr(
    X64::AssemblyBuilderX64& build, IrBuilder& ir, NativeState& data, ModuleHelpers& helpers, Proto* proto, AssemblyOptions options)
 {
-    constexpr uint32_t kFunctionAlignment = 32;
-
    optimizeMemoryOperandsX64(ir.function);

-    build.align(kFunctionAlignment, X64::AlignmentDataX64::Ud2);
-
    X64::IrLoweringX64 lowering(build, helpers, data, ir.function);

    return lowerImpl(build, lowering, ir.function, proto->bytecodeid, options);
@ -237,9 +248,6 @@ static bool lowerImpl(AssemblyBuilder& build, IrLowering& lowering, IrFunction&
 [[maybe_unused]] static bool lowerIr(
    A64::AssemblyBuilderA64& build, IrBuilder& ir, NativeState& data, ModuleHelpers& helpers, Proto* proto, AssemblyOptions options)
 {
-    if (!A64::IrLoweringA64::canLower(ir.function))
-        return false;
-
    A64::IrLoweringA64 lowering(build, helpers, data, proto, ir.function);

    return lowerImpl(build, lowering, ir.function, proto->bytecodeid, options);
@ -432,13 +440,13 @@ void create(lua_State* L)
    initHelperFunctions(data);

 #if defined(__x86_64__) || defined(_M_X64)
-    if (!X64::initEntryFunction(data))
+    if (!X64::initHeaderFunctions(data))
    {
        destroyNativeState(L);
        return;
    }
 #elif defined(__aarch64__)
-    if (!A64::initEntryFunction(data))
+    if (!A64::initHeaderFunctions(data))
    {
        destroyNativeState(L);
        return;
--- a/CodeGen/src/CodeGenA64.cpp
+++ b/CodeGen/src/CodeGenA64.cpp
@ -17,14 +17,107 @@ namespace CodeGen
 namespace A64
 {

-bool initEntryFunction(NativeState& data)
+struct EntryLocations
 {
-    AssemblyBuilderA64 build(/* logText= */ false);
-    UnwindBuilder& unwind = *data.unwindBuilder.get();
+    Label start;
+    Label prologueEnd;
+    Label epilogueStart;
+};
+
+static void emitExit(AssemblyBuilderA64& build, bool continueInVm) // Emits a jump to the gateExit address stored in NativeContext, with continueInVm in x0
+{
+    build.mov(x0, continueInVm); // x0 = continueInVm
+    build.ldr(x1, mem(rNativeContext, offsetof(NativeContext, gateExit))); // x1 = gateExit address read from the native context
+    build.br(x1);
+}
+
+static void emitInterrupt(AssemblyBuilderA64& build) // Emits code that calls the interrupt handler in x2, then either exits (L->status != 0) or jumps back to the address passed in x1
+{
+    // x0 = pc offset
+    // x1 = return address in native code
+    // x2 = interrupt
+
+    // Stash return address in rBase; we need to reload rBase anyway
+    build.mov(rBase, x1);
+
+    // Update savedpc; required in case interrupt errors
+    build.add(x0, rCode, x0);
+    build.ldr(x1, mem(rState, offsetof(lua_State, ci)));
+    build.str(x0, mem(x1, offsetof(CallInfo, savedpc)));
+
+    // Call interrupt
+    build.mov(x0, rState);
+    build.mov(w1, -1);
+    build.blr(x2);
+
+    // Check if we need to exit
+    Label skip;
+    build.ldrb(w0, mem(rState, offsetof(lua_State, status)));
+    build.cbz(w0, skip); // status == 0: no exit needed, continue at 'skip'
+
+    // L->ci->savedpc--
+    // note: recomputing this avoids having to stash x0
+    build.ldr(x1, mem(rState, offsetof(lua_State, ci)));
+    build.ldr(x0, mem(x1, offsetof(CallInfo, savedpc)));
+    build.sub(x0, x0, sizeof(Instruction));
+    build.str(x0, mem(x1, offsetof(CallInfo, savedpc)));
+
+    emitExit(build, /* continueInVm */ false);
+
+    build.setLabel(skip);
+
+    // Return back to caller; rBase has stashed return address
+    build.mov(x0, rBase);
+
+    emitUpdateBase(build); // interrupt may have reallocated stack
+
+    build.br(x0);
+}
+
+static void emitReentry(AssemblyBuilderA64& build, ModuleHelpers& helpers) // Emits code that resumes native execution of the closure in x0, or branches to one of the exit helpers
+{
+    // x0 = closure object to reenter (equal to clvalue(L->ci->func))
+
+    // If the fallback requested an exit, we need to do this right away
+    build.cbz(x0, helpers.exitNoContinueVm);
+
+    emitUpdateBase(build);
+
+    // Need to update state of the current function before we jump away
+    build.ldr(x1, mem(x0, offsetof(Closure, l.p))); // cl->l.p aka proto
+
+    build.mov(rClosure, x0);
+    build.ldr(rConstants, mem(x1, offsetof(Proto, k))); // proto->k
+    build.ldr(rCode, mem(x1, offsetof(Proto, code)));   // proto->code
+
+    // Compute the offset used to index instTargets from the saved instruction pointer
+    // To get instruction index from instruction pointer, we need to divide byte offset by 4
+    // But we then need to scale instruction index by 8 to index instTargets, so the net effect is doubling the byte offset
+    build.ldr(x2, mem(rState, offsetof(lua_State, ci))); // L->ci
+    build.ldr(x2, mem(x2, offsetof(CallInfo, savedpc))); // L->ci->savedpc
+    build.sub(x2, x2, rCode);
+    build.add(x2, x2, x2); // TODO: this would not be necessary if we supported shifted register offsets in loads
+
+    // We need to check if the new function can be executed natively
+    // TODO: This can be done earlier in the function flow, to reduce the JIT->VM transition penalty
+    build.ldr(x1, mem(x1, offsetofProtoExecData));
+    build.cbz(x1, helpers.exitContinueVm); // no native exec data for this proto: leave through the exitContinueVm helper
+
+    // Get new instruction location and jump to it
+    build.ldr(x1, mem(x1, offsetof(NativeProto, instTargets)));
+    build.ldr(x1, mem(x1, x2));
+    build.br(x1);
+}
+
+static EntryLocations buildEntryFunction(AssemblyBuilderA64& build, UnwindBuilder& unwind)
+{
+    EntryLocations locations;

    // Arguments: x0 = lua_State*, x1 = Proto*, x2 = native code pointer to jump to, x3 = NativeContext*

-    unwind.start();
+    locations.start = build.setLabel();
+    unwind.startFunction();
+
    unwind.allocStack(8); // TODO: this is just a hack to make UnwindBuilder assertions cooperate

    // prologue
@ -38,9 +131,7 @@ bool initEntryFunction(NativeState& data)

    build.mov(x29, sp); // this is only necessary if we maintain frame pointers, which we do in the JIT for now

-    unwind.finish();
-
-    size_t prologueSize = build.setLabel().location;
+    locations.prologueEnd = build.setLabel();

    // Setup native execution environment
    build.mov(rState, x0);
@ -58,7 +149,7 @@ bool initEntryFunction(NativeState& data)
    build.br(x2);

    // Even though we jumped away, we will return here in the end
-    Label returnOff = build.setLabel();
+    locations.epilogueStart = build.setLabel();

    // Cleanup and exit
    build.ldp(x23, x24, mem(sp, 48));
@ -69,12 +160,30 @@ bool initEntryFunction(NativeState& data)

    build.ret();

+    // Our entry function is special, it spans the whole remaining code area
+    unwind.finishFunction(build.getLabelOffset(locations.start), kFullBlockFuncton);
+
+    return locations;
+}
+
+bool initHeaderFunctions(NativeState& data)
+{
+    AssemblyBuilderA64 build(/* logText= */ false);
+    UnwindBuilder& unwind = *data.unwindBuilder.get();
+
+    unwind.startInfo();
+
+    EntryLocations entryLocations = buildEntryFunction(build, unwind);
+
    build.finalize();

+    unwind.finishInfo();
+
    LUAU_ASSERT(build.data.empty());

+    uint8_t* codeStart = nullptr;
    if (!data.codeAllocator.allocate(build.data.data(), int(build.data.size()), reinterpret_cast<const uint8_t*>(build.code.data()),
-            int(build.code.size() * sizeof(build.code[0])), data.gateData, data.gateDataSize, data.context.gateEntry))
+            int(build.code.size() * sizeof(build.code[0])), data.gateData, data.gateDataSize, codeStart))
    {
        LUAU_ASSERT(!"failed to create entry function");
        return false;
@ -82,9 +191,10 @@ bool initEntryFunction(NativeState& data)

    // Set the offset at the begining so that functions in new blocks will not overlay the locations
    // specified by the unwind information of the entry function
-    unwind.setBeginOffset(prologueSize);
+    unwind.setBeginOffset(build.getLabelOffset(entryLocations.prologueEnd));

-    data.context.gateExit = data.context.gateEntry + build.getLabelOffset(returnOff);
+    data.context.gateEntry = codeStart + build.getLabelOffset(entryLocations.start);
+    data.context.gateExit = codeStart + build.getLabelOffset(entryLocations.epilogueStart);

    return true;
 }
--- a/CodeGen/src/CodeGenA64.h
+++ b/CodeGen/src/CodeGenA64.h
@ -14,7 +14,7 @@ namespace A64

 class AssemblyBuilderA64;

-bool initEntryFunction(NativeState& data);
+bool initHeaderFunctions(NativeState& data);
 void assembleHelpers(AssemblyBuilderA64& build, ModuleHelpers& helpers);

 } // namespace A64
--- a/CodeGen/src/CodeGenUtils.cpp
+++ b/CodeGen/src/CodeGenUtils.cpp
@ -13,12 +13,58 @@ namespace Luau
 namespace CodeGen
 {

+bool forgLoopTableIter(lua_State* L, Table* h, int index, TValue* ra) // Finds the next non-nil table entry at or after 'index'; on success fills ra+2..ra+4 and returns true, otherwise returns false
+{
+    int sizearray = h->sizearray;
+
+    // first we advance index through the array portion
+    while (unsigned(index) < unsigned(sizearray))
+    {
+        TValue* e = &h->array[index];
+
+        if (!ttisnil(e))
+        {
+            setpvalue(ra + 2, reinterpret_cast<void*>(uintptr_t(index + 1))); // ra+2: index to resume from on the next call, stored as a pointer value
+            setnvalue(ra + 3, double(index + 1)); // ra+3: key, i.e. the 1-based array position
+            setobj2s(L, ra + 4, e); // ra+4: value
+
+            return true;
+        }
+
+        index++;
+    }
+
+    int sizenode = 1 << h->lsizenode;
+
+    // then we advance index through the hash portion (uses the cached sizearray, consistent with the node lookup below and with forgLoopNodeIter)
+    while (unsigned(index - sizearray) < unsigned(sizenode))
+    {
+        LuaNode* n = &h->node[index - sizearray];
+
+        if (!ttisnil(gval(n)))
+        {
+            setpvalue(ra + 2, reinterpret_cast<void*>(uintptr_t(index + 1))); // ra+2: index to resume from on the next call
+            getnodekey(L, ra + 3, n); // ra+3: key taken from the hash node
+            setobj(L, ra + 4, gval(n)); // ra+4: value
+
+            return true;
+        }
+
+        index++;
+    }
+
+    return false;
+}
+
 bool forgLoopNodeIter(lua_State* L, Table* h, int index, TValue* ra)
 {
+    int sizearray = h->sizearray;
+    int sizenode = 1 << h->lsizenode;
+
    // then we advance index through the hash portion
-    while (unsigned(index - h->sizearray) < unsigned(1 << h->lsizenode))
+    while (unsigned(index - sizearray) < unsigned(sizenode))
    {
-        LuaNode* n = &h->node[index - h->sizearray];
+        LuaNode* n = &h->node[index - sizearray];

        if (!ttisnil(gval(n)))
        {
--- a/CodeGen/src/CodeGenUtils.h
+++ b/CodeGen/src/CodeGenUtils.h
@ -8,6 +8,7 @@ namespace Luau
 namespace CodeGen
 {

+bool forgLoopTableIter(lua_State* L, Table* h, int index, TValue* ra);
 bool forgLoopNodeIter(lua_State* L, Table* h, int index, TValue* ra);
 bool forgLoopNonTableFallback(lua_State* L, int insnA, int aux);

--- a/CodeGen/src/CodeGenX64.cpp
+++ b/CodeGen/src/CodeGenX64.cpp
@ -41,12 +41,21 @@ namespace CodeGen
 namespace X64
 {

-bool initEntryFunction(NativeState& data)
+struct EntryLocations
 {
-    AssemblyBuilderX64 build(/* logText= */ false);
-    UnwindBuilder& unwind = *data.unwindBuilder.get();
+    Label start;
+    Label prologueEnd;
+    Label epilogueStart;
+};

-    unwind.start();
+static EntryLocations buildEntryFunction(AssemblyBuilderX64& build, UnwindBuilder& unwind)
+{
+    EntryLocations locations;
+
+    build.align(kFunctionAlignment, X64::AlignmentDataX64::Ud2);
+
+    locations.start = build.setLabel();
+    unwind.startFunction();

    // Save common non-volatile registers
    build.push(rbp);
@ -84,9 +93,7 @@ bool initEntryFunction(NativeState& data)
    build.sub(rsp, kStackSize + kLocalsSize);
    unwind.allocStack(kStackSize + kLocalsSize);

-    unwind.finish();
-
-    size_t prologueSize = build.setLabel().location;
+    locations.prologueEnd = build.setLabel();

    // Setup native execution environment
    build.mov(rState, rArg1);
@ -104,7 +111,7 @@ bool initEntryFunction(NativeState& data)
    build.jmp(rArg3);

    // Even though we jumped away, we will return here in the end
-    Label returnOff = build.setLabel();
+    locations.epilogueStart = build.setLabel();

    // Cleanup and exit
    build.add(rsp, kStackSize + kLocalsSize);
@ -123,12 +130,30 @@ bool initEntryFunction(NativeState& data)
    build.pop(rbp);
    build.ret();

+    // Our entry function is special, it spans the whole remaining code area
+    unwind.finishFunction(build.getLabelOffset(locations.start), kFullBlockFuncton);
+
+    return locations;
+}
+
+bool initHeaderFunctions(NativeState& data)
+{
+    AssemblyBuilderX64 build(/* logText= */ false);
+    UnwindBuilder& unwind = *data.unwindBuilder.get();
+
+    unwind.startInfo();
+
+    EntryLocations entryLocations = buildEntryFunction(build, unwind);
+
    build.finalize();

+    unwind.finishInfo();
+
    LUAU_ASSERT(build.data.empty());

-    if (!data.codeAllocator.allocate(build.data.data(), int(build.data.size()), build.code.data(), int(build.code.size()), data.gateData,
-            data.gateDataSize, data.context.gateEntry))
+    uint8_t* codeStart = nullptr;
+    if (!data.codeAllocator.allocate(
+            build.data.data(), int(build.data.size()), build.code.data(), int(build.code.size()), data.gateData, data.gateDataSize, codeStart))
    {
        LUAU_ASSERT(!"failed to create entry function");
        return false;
@ -136,9 +161,10 @@ bool initEntryFunction(NativeState& data)

    // Set the offset at the begining so that functions in new blocks will not overlay the locations
    // specified by the unwind information of the entry function
-    unwind.setBeginOffset(prologueSize);
+    unwind.setBeginOffset(build.getLabelOffset(entryLocations.prologueEnd));

-    data.context.gateExit = data.context.gateEntry + returnOff.location;
+    data.context.gateEntry = codeStart + build.getLabelOffset(entryLocations.start);
+    data.context.gateExit = codeStart + build.getLabelOffset(entryLocations.epilogueStart);

    return true;
 }
--- a/CodeGen/src/CodeGenX64.h
+++ b/CodeGen/src/CodeGenX64.h
@ -14,7 +14,7 @@ namespace X64

 class AssemblyBuilderX64;

-bool initEntryFunction(NativeState& data);
+bool initHeaderFunctions(NativeState& data);
 void assembleHelpers(AssemblyBuilderX64& build, ModuleHelpers& helpers);

 } // namespace X64
--- a/CodeGen/src/EmitBuiltinsX64.cpp
+++ b/CodeGen/src/EmitBuiltinsX64.cpp
@ -107,47 +107,11 @@ void emitBuiltinMathLog(IrRegAllocX64& regs, AssemblyBuilderX64& build, int npar
    regs.assertAllFree();
    build.vmovsd(xmm0, luauRegValue(arg));

-    if (nparams == 1)
-    {
-        build.call(qword[rNativeContext + offsetof(NativeContext, libm_log)]);
-    }
-    else
-    {
-        Label log10check, logdivlog, exit;
-
-        // Using 'rbx' for non-volatile temporary storage of log(arg1) result
-        RegisterX64 tmp = rbx;
-        OperandX64 arg2value = qword[args + offsetof(TValue, value)];
-
-        build.vmovsd(xmm1, arg2value);
-
-        jumpOnNumberCmp(build, noreg, build.f64(2.0), xmm1, IrCondition::NotEqual, log10check);
-
+    // TODO: IR builtin lowering assumes that the only valid 2-argument call is log2; ideally, we use a less hacky way to indicate that
+    if (nparams == 2)
        build.call(qword[rNativeContext + offsetof(NativeContext, libm_log2)]);
-        build.jmp(exit);
-
-        build.setLabel(log10check);
-        jumpOnNumberCmp(build, noreg, build.f64(10.0), xmm1, IrCondition::NotEqual, logdivlog);
-
-        build.call(qword[rNativeContext + offsetof(NativeContext, libm_log10)]);
-        build.jmp(exit);
-
-        build.setLabel(logdivlog);
-
-        // log(arg1)
+    else
        build.call(qword[rNativeContext + offsetof(NativeContext, libm_log)]);
-        build.vmovq(tmp, xmm0);
-
-        // log(arg2)
-        build.vmovsd(xmm0, arg2value);
-        build.call(qword[rNativeContext + offsetof(NativeContext, libm_log)]);
-
-        // log(arg1) / log(arg2)
-        build.vmovq(xmm1, tmp);
-        build.vdivsd(xmm0, xmm1, xmm0);
-
-        build.setLabel(exit);
-    }

    build.vmovsd(luauRegValue(ra), xmm0);
 }
@ -256,62 +220,68 @@ void emitBuiltin(IrRegAllocX64& regs, AssemblyBuilderX64& build, int bfid, int r

    switch (bfid)
    {
-    case LBF_ASSERT:
-    case LBF_MATH_DEG:
-    case LBF_MATH_RAD:
-    case LBF_MATH_MIN:
-    case LBF_MATH_MAX:
-    case LBF_MATH_CLAMP:
-    case LBF_MATH_FLOOR:
-    case LBF_MATH_CEIL:
-    case LBF_MATH_SQRT:
-    case LBF_MATH_POW:
-    case LBF_MATH_ABS:
-    case LBF_MATH_ROUND:
-        // These instructions are fully translated to IR
-        break;
    case LBF_MATH_EXP:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathExp(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_FMOD:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
        return emitBuiltinMathFmod(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_ASIN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathAsin(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_SIN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathSin(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_SINH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathSinh(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_ACOS:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathAcos(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_COS:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathCos(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_COSH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathCosh(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_ATAN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathAtan(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_TAN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathTan(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_TANH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathTanh(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_ATAN2:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
        return emitBuiltinMathAtan2(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_LOG10:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathLog10(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_LOG:
+        LUAU_ASSERT((nparams == 1 || nparams == 2) && nresults == 1);
        return emitBuiltinMathLog(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_LDEXP:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
        return emitBuiltinMathLdexp(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_FREXP:
+        LUAU_ASSERT(nparams == 1 && (nresults == 1 || nresults == 2));
        return emitBuiltinMathFrexp(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_MODF:
+        LUAU_ASSERT(nparams == 1 && (nresults == 1 || nresults == 2));
        return emitBuiltinMathModf(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_MATH_SIGN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinMathSign(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_TYPE:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinType(regs, build, nparams, ra, arg, argsOp, nresults);
    case LBF_TYPEOF:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
        return emitBuiltinTypeof(regs, build, nparams, ra, arg, argsOp, nresults);
    default:
-        LUAU_ASSERT(!"missing x64 lowering");
+        LUAU_ASSERT(!"Missing x64 lowering");
        break;
    }
 }
--- a/CodeGen/src/EmitCommon.h
+++ b/CodeGen/src/EmitCommon.h
@ -13,8 +13,8 @@ constexpr unsigned kLuaNodeSizeLog2 = 5;
 constexpr unsigned kLuaNodeTagMask = 0xf;
 constexpr unsigned kNextBitOffset = 4;

-constexpr unsigned kOffsetOfLuaNodeTag = 12;  // offsetof cannot be used on a bit field
-constexpr unsigned kOffsetOfLuaNodeNext = 12; // offsetof cannot be used on a bit field
+constexpr unsigned kOffsetOfTKeyTag = 12;  // offsetof cannot be used on a bit field
+constexpr unsigned kOffsetOfTKeyNext = 12; // offsetof cannot be used on a bit field
 constexpr unsigned kOffsetOfInstructionC = 3;

 // Leaf functions that are placed in every module to perform common instruction sequences
--- a/CodeGen/src/EmitCommonA64.cpp
+++ b/CodeGen/src/EmitCommonA64.cpp
@ -1,130 +0,0 @@
-// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
-#include "EmitCommonA64.h"
-
-#include "NativeState.h"
-#include "CustomExecUtils.h"
-
-namespace Luau
-{
-namespace CodeGen
-{
-namespace A64
-{
-
-void emitUpdateBase(AssemblyBuilderA64& build)
-{
-    build.ldr(rBase, mem(rState, offsetof(lua_State, base)));
-}
-
-void emitExit(AssemblyBuilderA64& build, bool continueInVm)
-{
-    build.mov(x0, continueInVm);
-    build.ldr(x1, mem(rNativeContext, offsetof(NativeContext, gateExit)));
-    build.br(x1);
-}
-
-void emitInterrupt(AssemblyBuilderA64& build)
-{
-    // x0 = pc offset
-    // x1 = return address in native code
-    // x2 = interrupt
-
-    // Stash return address in rBase; we need to reload rBase anyway
-    build.mov(rBase, x1);
-
-    // Update savedpc; required in case interrupt errors
-    build.add(x0, rCode, x0);
-    build.ldr(x1, mem(rState, offsetof(lua_State, ci)));
-    build.str(x0, mem(x1, offsetof(CallInfo, savedpc)));
-
-    // Call interrupt
-    build.mov(x0, rState);
-    build.mov(w1, -1);
-    build.blr(x2);
-
-    // Check if we need to exit
-    Label skip;
-    build.ldrb(w0, mem(rState, offsetof(lua_State, status)));
-    build.cbz(w0, skip);
-
-    // L->ci->savedpc--
-    // note: recomputing this avoids having to stash x0
-    build.ldr(x1, mem(rState, offsetof(lua_State, ci)));
-    build.ldr(x0, mem(x1, offsetof(CallInfo, savedpc)));
-    build.sub(x0, x0, sizeof(Instruction));
-    build.str(x0, mem(x1, offsetof(CallInfo, savedpc)));
-
-    emitExit(build, /* continueInVm */ false);
-
-    build.setLabel(skip);
-
-    // Return back to caller; rBase has stashed return address
-    build.mov(x0, rBase);
-
-    emitUpdateBase(build); // interrupt may have reallocated stack
-
-    build.br(x0);
-}
-
-void emitReentry(AssemblyBuilderA64& build, ModuleHelpers& helpers)
-{
-    // x0 = closure object to reentry (equal to clvalue(L->ci->func))
-
-    // If the fallback requested an exit, we need to do this right away
-    build.cbz(x0, helpers.exitNoContinueVm);
-
-    emitUpdateBase(build);
-
-    // Need to update state of the current function before we jump away
-    build.ldr(x1, mem(x0, offsetof(Closure, l.p))); // cl->l.p aka proto
-
-    build.mov(rClosure, x0);
-    build.ldr(rConstants, mem(x1, offsetof(Proto, k))); // proto->k
-    build.ldr(rCode, mem(x1, offsetof(Proto, code)));   // proto->code
-
-    // Get instruction index from instruction pointer
-    // To get instruction index from instruction pointer, we need to divide byte offset by 4
-    // But we will actually need to scale instruction index by 8 back to byte offset later so it cancels out
-    build.ldr(x2, mem(rState, offsetof(lua_State, ci))); // L->ci
-    build.ldr(x2, mem(x2, offsetof(CallInfo, savedpc))); // L->ci->savedpc
-    build.sub(x2, x2, rCode);
-    build.add(x2, x2, x2); // TODO: this would not be necessary if we supported shifted register offsets in loads
-
-    // We need to check if the new function can be executed natively
-    // TODO: This can be done earlier in the function flow, to reduce the JIT->VM transition penalty
-    build.ldr(x1, mem(x1, offsetofProtoExecData));
-    build.cbz(x1, helpers.exitContinueVm);
-
-    // Get new instruction location and jump to it
-    build.ldr(x1, mem(x1, offsetof(NativeProto, instTargets)));
-    build.ldr(x1, mem(x1, x2));
-    build.br(x1);
-}
-
-void emitFallback(AssemblyBuilderA64& build, int op, int pcpos)
-{
-    // fallback(L, instruction, base, k)
-    build.mov(x0, rState);
-
-    // TODO: refactor into a common helper
-    if (pcpos * sizeof(Instruction) <= AssemblyBuilderA64::kMaxImmediate)
-    {
-        build.add(x1, rCode, uint16_t(pcpos * sizeof(Instruction)));
-    }
-    else
-    {
-        build.mov(x1, pcpos * sizeof(Instruction));
-        build.add(x1, rCode, x1);
-    }
-
-    build.mov(x2, rBase);
-    build.mov(x3, rConstants);
-    build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, fallback) + op * sizeof(NativeFallback) + offsetof(NativeFallback, fallback)));
-    build.blr(x4);
-
-    emitUpdateBase(build);
-}
-
-} // namespace A64
-} // namespace CodeGen
-} // namespace Luau
--- a/CodeGen/src/EmitCommonA64.h
+++ b/CodeGen/src/EmitCommonA64.h
@ -7,6 +7,7 @@

 #include "lobject.h"
 #include "ltm.h"
+#include "lstate.h"

 // AArch64 ABI reminder:
 // Arguments: x0-x7, v0-v7
@ -38,15 +39,19 @@ constexpr RegisterA64 rBase = x24;      // StkId base

 // Native code is as stackless as the interpreter, so we can place some data on the stack once and have it accessible at any point
 // See CodeGenA64.cpp for layout
-constexpr unsigned kStackSize = 64; // 8 stashed registers
+constexpr unsigned kStashSlots = 8; // stashed non-volatile registers
+constexpr unsigned kSpillSlots = 0; // slots for spilling temporary registers (unused)
+constexpr unsigned kTempSlots = 2;  // 16 bytes of temporary space, such luxury!

-void emitUpdateBase(AssemblyBuilderA64& build);
+constexpr unsigned kStackSize = (kStashSlots + kSpillSlots + kTempSlots) * 8;

-// TODO: Move these to CodeGenA64 so that they can't be accidentally called during lowering
-void emitExit(AssemblyBuilderA64& build, bool continueInVm);
-void emitInterrupt(AssemblyBuilderA64& build);
-void emitReentry(AssemblyBuilderA64& build, ModuleHelpers& helpers);
-void emitFallback(AssemblyBuilderA64& build, int op, int pcpos);
+constexpr AddressA64 sSpillArea = mem(sp, kStashSlots * 8);
+constexpr AddressA64 sTemporary = mem(sp, (kStashSlots + kSpillSlots) * 8);
+
+inline void emitUpdateBase(AssemblyBuilderA64& build)
+{
+    build.ldr(rBase, mem(rState, offsetof(lua_State, base)));
+}

 } // namespace A64
 } // namespace CodeGen
--- a/CodeGen/src/EmitCommonX64.cpp
+++ b/CodeGen/src/EmitCommonX64.cpp
@ -279,32 +279,37 @@ void emitUpdateBase(AssemblyBuilderX64& build)
    build.mov(rBase, qword[rState + offsetof(lua_State, base)]);
 }

-// Note: only uses rax/rdx, the caller may use other registers
-static void emitSetSavedPc(AssemblyBuilderX64& build, int pcpos)
+static void emitSetSavedPc(IrRegAllocX64& regs, AssemblyBuilderX64& build, int pcpos)
 {
-    build.mov(rdx, sCode);
-    build.add(rdx, pcpos * sizeof(Instruction));
-    build.mov(rax, qword[rState + offsetof(lua_State, ci)]);
-    build.mov(qword[rax + offsetof(CallInfo, savedpc)], rdx);
+    ScopedRegX64 tmp1{regs, SizeX64::qword};
+    ScopedRegX64 tmp2{regs, SizeX64::qword};
+
+    build.mov(tmp1.reg, sCode);
+    build.add(tmp1.reg, pcpos * sizeof(Instruction));
+    build.mov(tmp2.reg, qword[rState + offsetof(lua_State, ci)]);
+    build.mov(qword[tmp2.reg + offsetof(CallInfo, savedpc)], tmp1.reg);
 }

-void emitInterrupt(AssemblyBuilderX64& build, int pcpos)
+void emitInterrupt(IrRegAllocX64& regs, AssemblyBuilderX64& build, int pcpos)
 {
    Label skip;

+    ScopedRegX64 tmp{regs, SizeX64::qword};
+
    // Skip if there is no interrupt set
-    build.mov(r8, qword[rState + offsetof(lua_State, global)]);
-    build.mov(r8, qword[r8 + offsetof(global_State, cb.interrupt)]);
-    build.test(r8, r8);
+    build.mov(tmp.reg, qword[rState + offsetof(lua_State, global)]);
+    build.mov(tmp.reg, qword[tmp.reg + offsetof(global_State, cb.interrupt)]);
+    build.test(tmp.reg, tmp.reg);
    build.jcc(ConditionX64::Zero, skip);

-    emitSetSavedPc(build, pcpos + 1); // uses rax/rdx
+    emitSetSavedPc(regs, build, pcpos + 1);

    // Call interrupt
    // TODO: This code should move to the end of the function, or even be outlined so that it can be shared by multiple interruptible instructions
-    build.mov(rArg1, rState);
-    build.mov(dwordReg(rArg2), -1); // function accepts 'int' here and using qword reg would've forced 8 byte constant here
-    build.call(r8);
+    IrCallWrapperX64 callWrap(regs, build);
+    callWrap.addArgument(SizeX64::qword, rState);
+    callWrap.addArgument(SizeX64::dword, -1);
+    callWrap.call(tmp.release());

    emitUpdateBase(build); // interrupt may have reallocated stack

@ -320,41 +325,23 @@ void emitInterrupt(AssemblyBuilderX64& build, int pcpos)
    build.setLabel(skip);
 }

-void emitFallback(AssemblyBuilderX64& build, NativeState& data, int op, int pcpos)
+void emitFallback(IrRegAllocX64& regs, AssemblyBuilderX64& build, NativeState& data, int op, int pcpos)
 {
-    NativeFallback& opinfo = data.context.fallback[op];
-    LUAU_ASSERT(opinfo.fallback);
-
-    if (build.logText)
-        build.logAppend("; fallback\n");
+    LUAU_ASSERT(data.context.fallback[op]);

    // fallback(L, instruction, base, k)
-    build.mov(rArg1, rState);
-    build.mov(rArg2, sCode);
-    build.add(rArg2, pcpos * sizeof(Instruction));
-    build.mov(rArg3, rBase);
-    build.mov(rArg4, rConstants);
-    build.call(qword[rNativeContext + offsetof(NativeContext, fallback) + op * sizeof(NativeFallback) + offsetof(NativeFallback, fallback)]);
+    IrCallWrapperX64 callWrap(regs, build);
+    callWrap.addArgument(SizeX64::qword, rState);
+
+    RegisterX64 reg = callWrap.suggestNextArgumentRegister(SizeX64::qword);
+    build.mov(reg, sCode);
+    callWrap.addArgument(SizeX64::qword, addr[reg + pcpos * sizeof(Instruction)]);
+
+    callWrap.addArgument(SizeX64::qword, rBase);
+    callWrap.addArgument(SizeX64::qword, rConstants);
+    callWrap.call(qword[rNativeContext + offsetof(NativeContext, fallback) + op * sizeof(FallbackFn)]);

    emitUpdateBase(build);
-
-    // Some instructions may jump to a different instruction or a completely different function
-    if (opinfo.flags & kFallbackUpdatePc)
-    {
-        build.mov(rcx, sClosure);
-        build.mov(rcx, qword[rcx + offsetof(Closure, l.p)]);
-
-        // Get instruction index from returned instruction pointer
-        // To get instruction index from instruction pointer, we need to divide byte offset by 4
-        // But we will actually need to scale instruction index by 8 back to byte offset later so it cancels out
-        build.sub(rax, sCode);
-
-        build.mov(rdx, qword[rcx + offsetofProtoExecData]);
-
-        // Get new instruction location and jump to it
-        build.mov(rcx, qword[rdx + offsetof(NativeProto, instTargets)]);
-        build.jmp(qword[rax * 2 + rcx]);
-    }
 }

 void emitContinueCallInVm(AssemblyBuilderX64& build)
--- a/CodeGen/src/EmitCommonX64.h
+++ b/CodeGen/src/EmitCommonX64.h
@ -34,6 +34,8 @@ namespace X64

 struct IrRegAllocX64;

+constexpr uint32_t kFunctionAlignment = 32;
+
 // Data that is very common to access is placed in non-volatile registers
 constexpr RegisterX64 rState = r15;         // lua_State* L
 constexpr RegisterX64 rBase = r14;          // StkId base
@ -134,7 +136,7 @@ inline OperandX64 luauNodeKeyValue(RegisterX64 node)
 // Note: tag has dirty upper bits
 inline OperandX64 luauNodeKeyTag(RegisterX64 node)
 {
-    return dword[node + offsetof(LuaNode, key) + kOffsetOfLuaNodeTag];
+    return dword[node + offsetof(LuaNode, key) + kOffsetOfTKeyTag];
 }

 inline OperandX64 luauNodeValue(RegisterX64 node)
@ -162,12 +164,6 @@ inline void jumpIfTagIsNot(AssemblyBuilderX64& build, int ri, lua_Type tag, Labe
    build.jcc(ConditionX64::NotEqual, label);
 }

-inline void jumpIfTagIsNot(AssemblyBuilderX64& build, RegisterX64 reg, lua_Type tag, Label& label)
-{
-    build.cmp(dword[reg + offsetof(TValue, tt)], tag);
-    build.jcc(ConditionX64::NotEqual, label);
-}
-
 // Note: fallthrough label should be placed after this condition
 inline void jumpIfFalsy(AssemblyBuilderX64& build, int ri, Label& target, Label& fallthrough)
 {
@ -188,26 +184,6 @@ inline void jumpIfTruthy(AssemblyBuilderX64& build, int ri, Label& target, Label
    build.jcc(ConditionX64::NotEqual, target); // true if boolean value is 'true'
 }

-inline void jumpIfMetatablePresent(AssemblyBuilderX64& build, RegisterX64 table, Label& target)
-{
-    build.cmp(qword[table + offsetof(Table, metatable)], 0);
-    build.jcc(ConditionX64::NotEqual, target);
-}
-
-inline void jumpIfUnsafeEnv(AssemblyBuilderX64& build, RegisterX64 tmp, Label& label)
-{
-    build.mov(tmp, sClosure);
-    build.mov(tmp, qword[tmp + offsetof(Closure, env)]);
-    build.test(byte[tmp + offsetof(Table, safeenv)], 1);
-    build.jcc(ConditionX64::Zero, label); // Not a safe environment
-}
-
-inline void jumpIfTableIsReadOnly(AssemblyBuilderX64& build, RegisterX64 table, Label& label)
-{
-    build.cmp(byte[table + offsetof(Table, readonly)], 0);
-    build.jcc(ConditionX64::NotEqual, label);
-}
-
 inline void jumpIfNodeKeyTagIsNot(AssemblyBuilderX64& build, RegisterX64 tmp, RegisterX64 node, lua_Type tag, Label& label)
 {
    tmp.size = SizeX64::dword;
@ -224,13 +200,6 @@ inline void jumpIfNodeValueTagIs(AssemblyBuilderX64& build, RegisterX64 node, lu
    build.jcc(ConditionX64::Equal, label);
 }

-inline void jumpIfNodeHasNext(AssemblyBuilderX64& build, RegisterX64 node, Label& label)
-{
-    build.mov(ecx, dword[node + offsetof(LuaNode, key) + kOffsetOfLuaNodeNext]);
-    build.shr(ecx, kNextBitOffset);
-    build.jcc(ConditionX64::NotZero, label);
-}
-
 inline void jumpIfNodeKeyNotInExpectedSlot(AssemblyBuilderX64& build, RegisterX64 tmp, RegisterX64 node, OperandX64 expectedKey, Label& label)
 {
    jumpIfNodeKeyTagIsNot(build, tmp, node, LUA_TSTRING, label);
@ -260,8 +229,8 @@ void callStepGc(IrRegAllocX64& regs, AssemblyBuilderX64& build);

 void emitExit(AssemblyBuilderX64& build, bool continueInVm);
 void emitUpdateBase(AssemblyBuilderX64& build);
-void emitInterrupt(AssemblyBuilderX64& build, int pcpos);
-void emitFallback(AssemblyBuilderX64& build, NativeState& data, int op, int pcpos);
+void emitInterrupt(IrRegAllocX64& regs, AssemblyBuilderX64& build, int pcpos);
+void emitFallback(IrRegAllocX64& regs, AssemblyBuilderX64& build, NativeState& data, int op, int pcpos);

 void emitContinueCallInVm(AssemblyBuilderX64& build);

--- a/CodeGen/src/EmitInstructionA64.cpp
+++ b/CodeGen/src/EmitInstructionA64.cpp
@ -1,74 +0,0 @@
-// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
-#include "EmitInstructionA64.h"
-
-#include "Luau/AssemblyBuilderA64.h"
-
-#include "EmitCommonA64.h"
-#include "NativeState.h"
-#include "CustomExecUtils.h"
-
-namespace Luau
-{
-namespace CodeGen
-{
-namespace A64
-{
-
-void emitInstReturn(AssemblyBuilderA64& build, ModuleHelpers& helpers, int ra, int n)
-{
-    // callFallback(L, ra, n)
-    build.mov(x0, rState);
-    build.add(x1, rBase, uint16_t(ra * sizeof(TValue)));
-    build.mov(w2, n);
-    build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, returnFallback)));
-    build.blr(x3);
-
-    // reentry with x0=closure (NULL will trigger exit)
-    build.b(helpers.reentry);
-}
-
-void emitInstCall(AssemblyBuilderA64& build, ModuleHelpers& helpers, int ra, int nparams, int nresults)
-{
-    // argtop = (nparams == LUA_MULTRET) ? L->top : ra + 1 + nparams;
-    if (nparams == LUA_MULTRET)
-        build.ldr(x2, mem(rState, offsetof(lua_State, top)));
-    else
-        build.add(x2, rBase, uint16_t((ra + 1 + nparams) * sizeof(TValue)));
-
-    // callFallback(L, ra, argtop, nresults)
-    build.mov(x0, rState);
-    build.add(x1, rBase, uint16_t(ra * sizeof(TValue)));
-    build.mov(w3, nresults);
-    build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, callFallback)));
-    build.blr(x4);
-
-    // reentry with x0=closure (NULL will trigger exit)
-    build.b(helpers.reentry);
-}
-
-void emitInstGetImport(AssemblyBuilderA64& build, int ra, uint32_t aux)
-{
-    // luaV_getimport(L, cl->env, k, aux, /* propagatenil= */ false)
-    build.mov(x0, rState);
-    build.ldr(x1, mem(rClosure, offsetof(Closure, env)));
-    build.mov(x2, rConstants);
-    build.mov(w3, aux);
-    build.mov(w4, 0);
-    build.ldr(x5, mem(rNativeContext, offsetof(NativeContext, luaV_getimport)));
-    build.blr(x5);
-
-    emitUpdateBase(build);
-
-    // setobj2s(L, ra, L->top - 1)
-    build.ldr(x0, mem(rState, offsetof(lua_State, top)));
-    build.sub(x0, x0, sizeof(TValue));
-    build.ldr(q0, x0);
-    build.str(q0, mem(rBase, ra * sizeof(TValue)));
-
-    // L->top--
-    build.str(x0, mem(rState, offsetof(lua_State, top)));
-}
-
-} // namespace A64
-} // namespace CodeGen
-} // namespace Luau
--- a/CodeGen/src/EmitInstructionA64.h
+++ b/CodeGen/src/EmitInstructionA64.h
@ -1,24 +0,0 @@
-// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
-#pragma once
-
-#include <stdint.h>
-
-namespace Luau
-{
-namespace CodeGen
-{
-
-struct ModuleHelpers;
-
-namespace A64
-{
-
-class AssemblyBuilderA64;
-
-void emitInstReturn(AssemblyBuilderA64& build, ModuleHelpers& helpers, int ra, int n);
-void emitInstCall(AssemblyBuilderA64& build, ModuleHelpers& helpers, int ra, int nparams, int nresults);
-void emitInstGetImport(AssemblyBuilderA64& build, int ra, uint32_t aux);
-
-} // namespace A64
-} // namespace CodeGen
-} // namespace Luau
--- a/CodeGen/src/EmitInstructionX64.cpp
+++ b/CodeGen/src/EmitInstructionX64.cpp
@ -415,7 +415,7 @@ void emitInstSetList(IrRegAllocX64& regs, AssemblyBuilderX64& build, int ra, int
    callBarrierTableFast(regs, build, table, {});
 }

-void emitinstForGLoop(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat, Label& loopExit)
+void emitInstForGLoop(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat)
 {
    // ipairs-style traversal is handled in IR
    LUAU_ASSERT(aux >= 0);
@ -484,78 +484,6 @@ void emitinstForGLoop(AssemblyBuilderX64& build, int ra, int aux, Label& loopRep
    build.jcc(ConditionX64::NotZero, loopRepeat);
 }

-void emitinstForGLoopFallback(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat)
-{
-    build.mov(rArg1, rState);
-    build.mov(dwordReg(rArg2), ra);
-    build.mov(dwordReg(rArg3), aux);
-    build.call(qword[rNativeContext + offsetof(NativeContext, forgLoopNonTableFallback)]);
-    emitUpdateBase(build);
-    build.test(al, al);
-    build.jcc(ConditionX64::NotZero, loopRepeat);
-}
-
-void emitInstForGPrepXnextFallback(AssemblyBuilderX64& build, int pcpos, int ra, Label& target)
-{
-    build.mov(rArg1, rState);
-    build.lea(rArg2, luauRegAddress(ra));
-    build.mov(dwordReg(rArg3), pcpos + 1);
-    build.call(qword[rNativeContext + offsetof(NativeContext, forgPrepXnextFallback)]);
-    build.jmp(target);
-}
-
-void emitInstGetImportFallback(AssemblyBuilderX64& build, int ra, uint32_t aux)
-{
-    build.mov(rax, sClosure);
-
-    // luaV_getimport(L, cl->env, k, aux, /* propagatenil= */ false)
-    build.mov(rArg1, rState);
-    build.mov(rArg2, qword[rax + offsetof(Closure, env)]);
-    build.mov(rArg3, rConstants);
-    build.mov(dwordReg(rArg4), aux);
-
-    if (build.abi == ABIX64::Windows)
-        build.mov(sArg5, 0);
-    else
-        build.xor_(rArg5, rArg5);
-
-    build.call(qword[rNativeContext + offsetof(NativeContext, luaV_getimport)]);
-
-    emitUpdateBase(build);
-
-    // setobj2s(L, ra, L->top - 1)
-    build.mov(rax, qword[rState + offsetof(lua_State, top)]);
-    build.sub(rax, sizeof(TValue));
-    build.vmovups(xmm0, xmmword[rax]);
-    build.vmovups(luauReg(ra), xmm0);
-
-    // L->top--
-    build.mov(qword[rState + offsetof(lua_State, top)], rax);
-}
-
-void emitInstCoverage(AssemblyBuilderX64& build, int pcpos)
-{
-    build.mov(rcx, sCode);
-    build.add(rcx, pcpos * sizeof(Instruction));
-
-    // hits = LUAU_INSN_E(*pc)
-    build.mov(edx, dword[rcx]);
-    build.sar(edx, 8);
-
-    // hits = (hits < (1 << 23) - 1) ? hits + 1 : hits;
-    build.xor_(eax, eax);
-    build.cmp(edx, (1 << 23) - 1);
-    build.setcc(ConditionX64::NotEqual, al);
-    build.add(edx, eax);
-
-
-    // VM_PATCH_E(pc, hits);
-    build.sal(edx, 8);
-    build.movzx(eax, byte[rcx]);
-    build.or_(eax, edx);
-    build.mov(dword[rcx], eax);
-}
-
 } // namespace X64
 } // namespace CodeGen
 } // namespace Luau
--- a/CodeGen/src/EmitInstructionX64.h
+++ b/CodeGen/src/EmitInstructionX64.h
@ -20,11 +20,7 @@ struct IrRegAllocX64;
 void emitInstCall(AssemblyBuilderX64& build, ModuleHelpers& helpers, int ra, int nparams, int nresults);
 void emitInstReturn(AssemblyBuilderX64& build, ModuleHelpers& helpers, int ra, int actualResults);
 void emitInstSetList(IrRegAllocX64& regs, AssemblyBuilderX64& build, int ra, int rb, int count, uint32_t index);
-void emitinstForGLoop(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat, Label& loopExit);
-void emitinstForGLoopFallback(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat);
-void emitInstForGPrepXnextFallback(AssemblyBuilderX64& build, int pcpos, int ra, Label& target);
-void emitInstGetImportFallback(AssemblyBuilderX64& build, int ra, uint32_t aux);
-void emitInstCoverage(AssemblyBuilderX64& build, int pcpos);
+void emitInstForGLoop(AssemblyBuilderX64& build, int ra, int aux, Label& loopRepeat);

 } // namespace X64
 } // namespace CodeGen
--- a/CodeGen/src/Fallbacks.cpp
+++ b/CodeGen/src/Fallbacks.cpp
@ -416,6 +416,44 @@ const Instruction* execute_LOP_NAMECALL(lua_State* L, const Instruction* pc, Stk
    return pc;
 }

+const Instruction* execute_LOP_SETLIST(lua_State* L, const Instruction* pc, StkId base, TValue* k)
+{
+    [[maybe_unused]] Closure* cl = clvalue(L->ci->func);
+    Instruction insn = *pc++;
+    StkId ra = VM_REG(LUAU_INSN_A(insn));
+    StkId rb = &base[LUAU_INSN_B(insn)]; // note: this can point to L->top if c == LUA_MULTRET making VM_REG unsafe to use
+    int c = LUAU_INSN_C(insn) - 1;
+    uint32_t index = *pc++;
+
+    if (c == LUA_MULTRET)
+    {
+        c = int(L->top - rb);
+        L->top = L->ci->top;
+    }
+
+    Table* h = hvalue(ra);
+
+    // TODO: we really don't need this anymore
+    if (!ttistable(ra))
+        return NULL; // temporary workaround to weaken a rather powerful exploitation primitive in case of a MITM attack on bytecode
+
+    int last = index + c - 1;
+    if (last > h->sizearray)
+    {
+        VM_PROTECT_PC(); // luaH_resizearray may fail due to OOM
+
+        luaH_resizearray(L, h, last);
+    }
+
+    TValue* array = h->array;
+
+    for (int i = 0; i < c; ++i)
+        setobj2t(L, &array[index + i - 1], rb + i);
+
+    luaC_barrierfast(L, h);
+    return pc;
+}
+
 const Instruction* execute_LOP_FORGPREP(lua_State* L, const Instruction* pc, StkId base, TValue* k)
 {
    [[maybe_unused]] Closure* cl = clvalue(L->ci->func);
--- a/CodeGen/src/Fallbacks.h
+++ b/CodeGen/src/Fallbacks.h
@ -16,6 +16,7 @@ const Instruction* execute_LOP_GETTABLEKS(lua_State* L, const Instruction* pc, S
 const Instruction* execute_LOP_SETTABLEKS(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 const Instruction* execute_LOP_NEWCLOSURE(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 const Instruction* execute_LOP_NAMECALL(lua_State* L, const Instruction* pc, StkId base, TValue* k);
+const Instruction* execute_LOP_SETLIST(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 const Instruction* execute_LOP_FORGPREP(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 const Instruction* execute_LOP_GETVARARGS(lua_State* L, const Instruction* pc, StkId base, TValue* k);
 const Instruction* execute_LOP_DUPCLOSURE(lua_State* L, const Instruction* pc, StkId base, TValue* k);
--- a/CodeGen/src/IrAnalysis.cpp
+++ b/CodeGen/src/IrAnalysis.cpp
@ -354,6 +354,8 @@ static RegisterSet computeBlockLiveInRegSet(IrFunction& function, const IrBlock&
        case IrCmd::RETURN:
            useRange(vmRegOp(inst.a), function.intOp(inst.b));
            break;
+
+            // TODO: FASTCALL is more restrictive than INVOKE_FASTCALL; we should either determine the exact semantics, or rework it
        case IrCmd::FASTCALL:
        case IrCmd::INVOKE_FASTCALL:
            if (int count = function.intOp(inst.e); count != -1)
--- a/CodeGen/src/IrBuilder.cpp
+++ b/CodeGen/src/IrBuilder.cpp
@ -468,7 +468,8 @@ void IrBuilder::clone(const IrBlock& source, bool removeCurrentTerminator)
        IrInst clone = function.instructions[index];

        // Skip pseudo instructions to make clone more compact, but validate that they have no users
-        if (isPseudo(clone.cmd))
+        // But if substitution tracks a location, that tracking has to be preserved
+        if (isPseudo(clone.cmd) && !(clone.cmd == IrCmd::SUBSTITUTE && clone.b.kind != IrOpKind::None))
        {
            LUAU_ASSERT(clone.useCount == 0);
            continue;
--- a/CodeGen/src/IrCallWrapperX64.cpp
+++ b/CodeGen/src/IrCallWrapperX64.cpp
@ -13,6 +13,10 @@ namespace CodeGen
 namespace X64
 {

+static const std::array<OperandX64, 6> kWindowsGprOrder = {rcx, rdx, r8, r9, addr[rsp + 32], addr[rsp + 40]};
+static const std::array<OperandX64, 6> kSystemvGprOrder = {rdi, rsi, rdx, rcx, r8, r9};
+static const std::array<OperandX64, 4> kXmmOrder = {xmm0, xmm1, xmm2, xmm3}; // Common order for first 4 fp arguments on Windows/SystemV
+
 static bool sameUnderlyingRegister(RegisterX64 a, RegisterX64 b)
 {
    SizeX64 underlyingSizeA = a.size == SizeX64::xmmword ? SizeX64::xmmword : SizeX64::qword;
@ -37,21 +41,35 @@ void IrCallWrapperX64::addArgument(SizeX64 targetSize, OperandX64 source, IrOp s
    LUAU_ASSERT(instIdx != kInvalidInstIdx || sourceOp.kind == IrOpKind::None);

    LUAU_ASSERT(argCount < kMaxCallArguments);
-    args[argCount++] = {targetSize, source, sourceOp};
+    CallArgument& arg = args[argCount++];
+    arg = {targetSize, source, sourceOp};
+
+    arg.target = getNextArgumentTarget(targetSize);
+
+    if (build.abi == ABIX64::Windows)
+    {
+        // On Windows, gpr/xmm register positions move in sync
+        gprPos++;
+        xmmPos++;
+    }
+    else
+    {
+        if (targetSize == SizeX64::xmmword)
+            xmmPos++;
+        else
+            gprPos++;
+    }
 }

 void IrCallWrapperX64::addArgument(SizeX64 targetSize, ScopedRegX64& scopedReg)
 {
-    LUAU_ASSERT(argCount < kMaxCallArguments);
-    args[argCount++] = {targetSize, scopedReg.release(), {}};
+    addArgument(targetSize, scopedReg.release(), {});
 }

 void IrCallWrapperX64::call(const OperandX64& func)
 {
    funcOp = func;

-    assignTargetRegisters();
-
    countRegisterUses();

    for (int i = 0; i < argCount; ++i)
@ -190,44 +208,33 @@ void IrCallWrapperX64::call(const OperandX64& func)
    build.call(funcOp);
 }

-void IrCallWrapperX64::assignTargetRegisters()
+RegisterX64 IrCallWrapperX64::suggestNextArgumentRegister(SizeX64 size) const
 {
-    static const std::array<OperandX64, 6> kWindowsGprOrder = {rcx, rdx, r8, r9, addr[rsp + 32], addr[rsp + 40]};
-    static const std::array<OperandX64, 6> kSystemvGprOrder = {rdi, rsi, rdx, rcx, r8, r9};
+    OperandX64 target = getNextArgumentTarget(size);
+
+    return target.cat == CategoryX64::reg ? regs.takeReg(target.base, kInvalidInstIdx) : regs.allocReg(size, kInvalidInstIdx);
+}
+
+OperandX64 IrCallWrapperX64::getNextArgumentTarget(SizeX64 size) const
+{
+    if (size == SizeX64::xmmword)
+    {
+        LUAU_ASSERT(size_t(xmmPos) < kXmmOrder.size());
+        return kXmmOrder[xmmPos];
+    }

    const std::array<OperandX64, 6>& gprOrder = build.abi == ABIX64::Windows ? kWindowsGprOrder : kSystemvGprOrder;
-    static const std::array<OperandX64, 4> kXmmOrder = {xmm0, xmm1, xmm2, xmm3}; // Common order for first 4 fp arguments on Windows/SystemV

-    int gprPos = 0;
-    int xmmPos = 0;
+    LUAU_ASSERT(size_t(gprPos) < gprOrder.size());
+    OperandX64 target = gprOrder[gprPos];

-    for (int i = 0; i < argCount; i++)
-    {
-        CallArgument& arg = args[i];
+    // Keep requested argument size
+    if (target.cat == CategoryX64::reg)
+        target.base.size = size;
+    else if (target.cat == CategoryX64::mem)
+        target.memSize = size;

-        if (arg.targetSize == SizeX64::xmmword)
-        {
-            LUAU_ASSERT(size_t(xmmPos) < kXmmOrder.size());
-            arg.target = kXmmOrder[xmmPos++];
-
-            if (build.abi == ABIX64::Windows)
-                gprPos++; // On Windows, gpr/xmm register positions move in sync
-        }
-        else
-        {
-            LUAU_ASSERT(size_t(gprPos) < gprOrder.size());
-            arg.target = gprOrder[gprPos++];
-
-            if (build.abi == ABIX64::Windows)
-                xmmPos++; // On Windows, gpr/xmm register positions move in sync
-
-            // Keep requested argument size
-            if (arg.target.cat == CategoryX64::reg)
-                arg.target.base.size = arg.targetSize;
-            else if (arg.target.cat == CategoryX64::mem)
-                arg.target.memSize = arg.targetSize;
-        }
-    }
+    return target;
 }

 void IrCallWrapperX64::countRegisterUses()
@ -376,7 +383,7 @@ RegisterX64 IrCallWrapperX64::findConflictingTarget() const
 void IrCallWrapperX64::renameConflictingRegister(RegisterX64 conflict)
 {
    // Get a fresh register
-    RegisterX64 freshReg = conflict.size == SizeX64::xmmword ? regs.allocXmmReg(kInvalidInstIdx) : regs.allocGprReg(conflict.size, kInvalidInstIdx);
+    RegisterX64 freshReg = regs.allocReg(conflict.size, kInvalidInstIdx);

    if (conflict.size == SizeX64::xmmword)
        build.vmovsd(freshReg, conflict, conflict);
--- a/CodeGen/src/IrLoweringA64.cpp
+++ b/CodeGen/src/IrLoweringA64.cpp
@ -8,7 +8,6 @@
 #include "Luau/IrUtils.h"

 #include "EmitCommonA64.h"
-#include "EmitInstructionA64.h"
 #include "NativeState.h"

 #include "lstate.h"
@ -27,13 +26,14 @@ namespace A64
 #ifdef TRACE
 struct LoweringStatsA64
 {
-    size_t can;
+    size_t missing;
    size_t total;

    ~LoweringStatsA64()
    {
        if (total)
-            printf("A64 lowering succeeded for %.1f%% functions (%d/%d)\n", double(can) / double(total) * 100, int(can), int(total));
+            printf("A64 lowering succeeded for %.1f%% functions (%d/%d)\n", double(total - missing) / double(total) * 100, int(total - missing),
+                int(total));
    }
 } gStatsA64;
 #endif
@ -78,32 +78,230 @@ inline ConditionA64 getConditionFP(IrCondition cond)
    }
 }

-// TODO: instead of temp1/temp2 we can take a register that we will use for ra->value; that way callers to this function will be able to use it when
-// calling luaC_barrier*
-static void checkObjectBarrierConditions(AssemblyBuilderA64& build, RegisterA64 object, RegisterA64 temp1, RegisterA64 temp2, int ra, Label& skip)
+// Emits the inline checks that decide whether a GC barrier call is needed for storing VM register 'ra' into 'object'.
+// Control branches to 'skip' as soon as one check fails; falling through means all three checks passed.
+// 'temp' must be a 64-bit register: it is clobbered, both through its 32-bit view and as a full pointer.
+static void checkObjectBarrierConditions(AssemblyBuilderA64& build, RegisterA64 object, RegisterA64 temp, int ra, Label& skip)
 {
-    RegisterA64 temp1w = castReg(KindA64::w, temp1);
-    RegisterA64 temp2w = castReg(KindA64::w, temp2);
+    // 32-bit view of the same register, used for tag and mark-byte loads
+    RegisterA64 tempw = castReg(KindA64::w, temp);

    // iscollectable(ra)
-    build.ldr(temp1w, mem(rBase, ra * sizeof(TValue) + offsetof(TValue, tt)));
-    build.cmp(temp1w, LUA_TSTRING);
+    build.ldr(tempw, mem(rBase, ra * sizeof(TValue) + offsetof(TValue, tt)));
+    build.cmp(tempw, LUA_TSTRING);
    build.b(ConditionA64::Less, skip);

    // isblack(obj2gco(o))
    // TODO: conditional bit test with BLACKBIT
-    build.ldrb(temp1w, mem(object, offsetof(GCheader, marked)));
-    build.mov(temp2w, bitmask(BLACKBIT));
-    build.and_(temp1w, temp1w, temp2w);
-    build.cbz(temp1w, skip);
+    build.ldrb(tempw, mem(object, offsetof(GCheader, marked)));
+    build.tst(tempw, bitmask(BLACKBIT));
+    build.b(ConditionA64::Equal, skip); // Equal = Zero after tst

    // iswhite(gcvalue(ra))
-    // TODO: tst with bitmask(WHITE0BIT, WHITE1BIT)
-    build.ldr(temp1, mem(rBase, ra * sizeof(TValue) + offsetof(TValue, value)));
-    build.ldrb(temp1w, mem(temp1, offsetof(GCheader, marked)));
-    build.mov(temp2w, bit2mask(WHITE0BIT, WHITE1BIT));
-    build.and_(temp1w, temp1w, temp2w);
-    build.cbz(temp1w, skip);
+    // the value pointer is loaded into the full 64-bit register, then its mark byte replaces it in the 32-bit view
+    build.ldr(temp, mem(rBase, ra * sizeof(TValue) + offsetof(TValue, value)));
+    build.ldrb(tempw, mem(temp, offsetof(GCheader, marked)));
+    build.tst(tempw, bit2mask(WHITE0BIT, WHITE1BIT));
+    build.b(ConditionA64::Equal, skip); // Equal = Zero after tst
+}
+
+// Emits code that computes dst = src + offset.
+// Offsets up to AssemblyBuilderA64::kMaxImmediate use a single add with an immediate operand;
+// larger offsets are first materialized in dst and then added to src.
+static void emitAddOffset(AssemblyBuilderA64& build, RegisterA64 dst, RegisterA64 src, size_t offset)
+{
+    // dst doubles as the scratch register in the large-offset path, so it must not alias src
+    LUAU_ASSERT(dst != src);
+    // the large-offset path narrows offset to int before emitting mov
+    LUAU_ASSERT(offset <= INT_MAX);
+
+    if (offset <= AssemblyBuilderA64::kMaxImmediate)
+    {
+        build.add(dst, src, uint16_t(offset));
+    }
+    else
+    {
+        build.mov(dst, int(offset));
+        build.add(dst, dst, src);
+    }
+}
+
+// Emits a call to the fallback handler for 'op', taken from the NativeContext::fallback table.
+// The handler receives the state, the address of the instruction at 'pcpos', the stack base and the constant table in x0..x3.
+static void emitFallback(AssemblyBuilderA64& build, int op, int pcpos)
+{
+    // fallback(L, instruction, base, k)
+    build.mov(x0, rState);
+    // instruction pointer = rCode + pcpos * sizeof(Instruction)
+    emitAddOffset(build, x1, rCode, pcpos * sizeof(Instruction));
+    build.mov(x2, rBase);
+    build.mov(x3, rConstants);
+    // the handler pointer lives at index 'op' of the fallback table inside the native context
+    build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, fallback) + op * sizeof(FallbackFn)));
+    build.blr(x4);
+
+    // NOTE(review): presumably refreshes rBase because the handler may have moved the VM stack - confirm against emitUpdateBase
+    emitUpdateBase(build);
+}
+
+// Emits a call to a one-argument math function whose pointer is stored at byte offset 'func' from rNativeContext.
+// The argument is the number in VM register 'arg' (passed in d0); the d0 result is stored to the number field of VM register 'res'.
+// Only the value field of 'res' is written here; the tag is not touched.
+static void emitInvokeLibm1(AssemblyBuilderA64& build, size_t func, int res, int arg)
+{
+    build.ldr(d0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, value.n)));
+    // x0 only holds the call target
+    build.ldr(x0, mem(rNativeContext, uint32_t(func)));
+    build.blr(x0);
+    build.str(d0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n)));
+}
+
+// Emits a call to a two-argument math function whose pointer is stored at byte offset 'func' from rNativeContext.
+// The first argument is the number in VM register 'arg' (passed in d0).
+// The second argument comes from 'args', which must be a VM register or a VM constant; it is loaded into d1 and,
+// when 'argsInt' is set, additionally converted to an integer in w0 for callees that take (double, int).
+// The d0 result is stored to the number field of VM register 'res'; the tag is not touched.
+static void emitInvokeLibm2(AssemblyBuilderA64& build, size_t func, int res, int arg, IrOp args, bool argsInt = false)
+{
+    if (args.kind == IrOpKind::VmReg)
+        build.ldr(d1, mem(rBase, args.index * sizeof(TValue) + offsetof(TValue, value.n)));
+    else if (args.kind == IrOpKind::VmConst)
+    {
+        size_t constantOffset = args.index * sizeof(TValue) + offsetof(TValue, value.n);
+
+        // Note: cumulative offset is guaranteed to be divisible by 8 (since we're loading a double); we can use that to expand the useful range that
+        // doesn't require temporaries
+        if (constantOffset / 8 <= AddressA64::kMaxOffset)
+        {
+            build.ldr(d1, mem(rConstants, int(constantOffset)));
+        }
+        else
+        {
+            // x0 is free to use as an address scratch here: it is only given a meaningful value further down (argsInt)
+            emitAddOffset(build, x0, rConstants, constantOffset);
+            build.ldr(d1, x0);
+        }
+    }
+    else
+        LUAU_ASSERT(!"Unsupported instruction form");
+
+    // integer form of the second argument goes in w0
+    if (argsInt)
+        build.fcvtzs(w0, d1);
+
+    build.ldr(d0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, value.n)));
+    // the call target is kept in x1 because x0/w0 may carry the integer argument
+    build.ldr(x1, mem(rNativeContext, uint32_t(func)));
+    build.blr(x1);
+    build.str(d0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n)));
+}
+
+// Emits a call to a math function taking a number and a pointer; the function pointer is stored at byte offset 'func' from rNativeContext.
+// The number comes from VM register 'arg' (passed in d0) and the pointer argument (x0) is the address of the sTemporary stack slot.
+// Nothing is stored back to VM registers here: the caller reads d0 and sTemporary after this sequence.
+static void emitInvokeLibm1P(AssemblyBuilderA64& build, size_t func, int arg)
+{
+    build.ldr(d0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, value.n)));
+    build.add(x0, sp, sTemporary.data); // sp-relative offset
+    build.ldr(x1, mem(rNativeContext, uint32_t(func)));
+    build.blr(x1);
+}
+
+// Emits an inline A64 lowering for the builtin function identified by 'bfid'.
+//   res      - VM register that receives the first result (a second result, when requested, goes to res + 1)
+//   arg      - VM register holding the first argument
+//   args     - operand for the second argument of two-argument builtins (VM register or VM constant)
+//   nparams  - number of parameters at the call site; asserted per builtin for the math cases
+//   nresults - number of requested results; asserted per builtin for the math cases
+// Returns true when code was emitted; unknown builtins assert and return false.
+// Only the value fields of result registers are written here; result tags are not stored by this function.
+// 'regs' is not used by the current body.
+static bool emitBuiltin(AssemblyBuilderA64& build, IrRegAllocA64& regs, int bfid, int res, int arg, IrOp args, int nparams, int nresults)
+{
+    switch (bfid)
+    {
+    case LBF_MATH_EXP:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_exp), res, arg);
+        return true;
+    case LBF_MATH_FMOD:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
+        emitInvokeLibm2(build, offsetof(NativeContext, libm_fmod), res, arg, args);
+        return true;
+    case LBF_MATH_ASIN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_asin), res, arg);
+        return true;
+    case LBF_MATH_SIN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_sin), res, arg);
+        return true;
+    case LBF_MATH_SINH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_sinh), res, arg);
+        return true;
+    case LBF_MATH_ACOS:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_acos), res, arg);
+        return true;
+    case LBF_MATH_COS:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_cos), res, arg);
+        return true;
+    case LBF_MATH_COSH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_cosh), res, arg);
+        return true;
+    case LBF_MATH_ATAN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_atan), res, arg);
+        return true;
+    case LBF_MATH_TAN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_tan), res, arg);
+        return true;
+    case LBF_MATH_TANH:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_tanh), res, arg);
+        return true;
+    case LBF_MATH_ATAN2:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
+        emitInvokeLibm2(build, offsetof(NativeContext, libm_atan2), res, arg, args);
+        return true;
+    case LBF_MATH_LOG10:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        emitInvokeLibm1(build, offsetof(NativeContext, libm_log10), res, arg);
+        return true;
+    case LBF_MATH_LOG:
+        LUAU_ASSERT((nparams == 1 || nparams == 2) && nresults == 1);
+        // TODO: IR builtin lowering assumes that the only valid 2-argument call is log2; ideally, we use a less hacky way to indicate that
+        if (nparams == 2)
+            emitInvokeLibm1(build, offsetof(NativeContext, libm_log2), res, arg);
+        else
+            emitInvokeLibm1(build, offsetof(NativeContext, libm_log), res, arg);
+        return true;
+    case LBF_MATH_LDEXP:
+        LUAU_ASSERT(nparams == 2 && nresults == 1);
+        emitInvokeLibm2(build, offsetof(NativeContext, libm_ldexp), res, arg, args, /* argsInt= */ true);
+        return true;
+    case LBF_MATH_FREXP:
+        LUAU_ASSERT(nparams == 1 && (nresults == 1 || nresults == 2));
+        emitInvokeLibm1P(build, offsetof(NativeContext, libm_frexp), arg);
+        // the returned double is the first result
+        build.str(d0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n)));
+        if (nresults == 2)
+        {
+            // the value written through the pointer is read back as a 32-bit integer and converted to a double for the second result
+            build.ldr(w0, sTemporary);
+            build.scvtf(d1, w0);
+            build.str(d1, mem(rBase, (res + 1) * sizeof(TValue) + offsetof(TValue, value.n)));
+        }
+        return true;
+    case LBF_MATH_MODF:
+        LUAU_ASSERT(nparams == 1 && (nresults == 1 || nresults == 2));
+        emitInvokeLibm1P(build, offsetof(NativeContext, libm_modf), arg);
+        // result order is swapped relative to the call: the double written through the pointer is the first result, the returned d0 is the second
+        build.ldr(d1, sTemporary);
+        build.str(d1, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n)));
+        if (nresults == 2)
+            build.str(d0, mem(rBase, (res + 1) * sizeof(TValue) + offsetof(TValue, value.n)));
+        return true;
+    case LBF_MATH_SIGN:
+        LUAU_ASSERT(nparams == 1 && nresults == 1);
+        // TODO: this can be improved with fmov(constant), for now we just load from memory
+        // flags are set once by comparing the input with zero; the loads below do not change them, so both selects test the original input
+        build.ldr(d0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, value.n)));
+        build.fcmpz(d0);
+        // start from 0.0, then replace with 1.0 if input > 0 and with -1.0 if input < 0
+        build.adr(x0, 0.0);
+        build.ldr(d0, x0);
+        build.adr(x0, 1.0);
+        build.ldr(d1, x0);
+        build.fcsel(d0, d1, d0, getConditionFP(IrCondition::Greater));
+        build.adr(x0, -1.0);
+        build.ldr(d1, x0);
+        build.fcsel(d0, d1, d0, getConditionFP(IrCondition::Less));
+        build.str(d0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.n)));
+        return true;
+
+    case LBF_TYPE:
+        // result = L->global->ttname[tag of arg]; the index is applied as tag * 8 before adding the ttname field offset
+        build.ldr(w0, mem(rBase, arg * sizeof(TValue) + offsetof(TValue, tt)));
+        build.ldr(x1, mem(rState, offsetof(lua_State, global)));
+        // TODO: this can use load with shifted/extended offset
+        LUAU_ASSERT(sizeof(TString*) == 8);
+        build.add(x1, x1, zextReg(w0), 3);
+        build.ldr(x0, mem(x1, offsetof(global_State, ttname)));
+        build.str(x0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.gc)));
+        return true;
+
+    case LBF_TYPEOF:
+        // calls luaT_objtypenamestr(L, &base[arg]) and stores the returned pointer as the result value
+        build.mov(x0, rState);
+        build.add(x1, rBase, uint16_t(arg * sizeof(TValue)));
+        build.ldr(x2, mem(rNativeContext, offsetof(NativeContext, luaT_objtypenamestr)));
+        build.blr(x2);
+        build.str(x0, mem(rBase, res * sizeof(TValue) + offsetof(TValue, value.gc)));
+        return true;
+
+    default:
+        LUAU_ASSERT(!"Missing A64 lowering");
+        return false;
+    }
 }

 IrLoweringA64::IrLoweringA64(AssemblyBuilderA64& build, ModuleHelpers& helpers, NativeState& data, Proto* proto, IrFunction& function)
@ -116,119 +314,10 @@ IrLoweringA64::IrLoweringA64(AssemblyBuilderA64& build, ModuleHelpers& helpers,
 {
    // In order to allocate registers during lowering, we need to know where instruction results are last used
    updateLastUseLocations(function);
-}

-// TODO: Eventually this can go away
-bool IrLoweringA64::canLower(const IrFunction& function)
-{
 #ifdef TRACE
    gStatsA64.total++;
 #endif
-
-    for (const IrInst& inst : function.instructions)
-    {
-        switch (inst.cmd)
-        {
-        case IrCmd::NOP:
-        case IrCmd::LOAD_TAG:
-        case IrCmd::LOAD_POINTER:
-        case IrCmd::LOAD_DOUBLE:
-        case IrCmd::LOAD_INT:
-        case IrCmd::LOAD_TVALUE:
-        case IrCmd::LOAD_NODE_VALUE_TV:
-        case IrCmd::LOAD_ENV:
-        case IrCmd::GET_ARR_ADDR:
-        case IrCmd::GET_SLOT_NODE_ADDR:
-        case IrCmd::GET_HASH_NODE_ADDR:
-        case IrCmd::STORE_TAG:
-        case IrCmd::STORE_POINTER:
-        case IrCmd::STORE_DOUBLE:
-        case IrCmd::STORE_INT:
-        case IrCmd::STORE_TVALUE:
-        case IrCmd::STORE_NODE_VALUE_TV:
-        case IrCmd::ADD_INT:
-        case IrCmd::SUB_INT:
-        case IrCmd::ADD_NUM:
-        case IrCmd::SUB_NUM:
-        case IrCmd::MUL_NUM:
-        case IrCmd::DIV_NUM:
-        case IrCmd::MOD_NUM:
-        case IrCmd::POW_NUM:
-        case IrCmd::MIN_NUM:
-        case IrCmd::MAX_NUM:
-        case IrCmd::UNM_NUM:
-        case IrCmd::FLOOR_NUM:
-        case IrCmd::CEIL_NUM:
-        case IrCmd::ROUND_NUM:
-        case IrCmd::SQRT_NUM:
-        case IrCmd::ABS_NUM:
-        case IrCmd::JUMP:
-        case IrCmd::JUMP_IF_TRUTHY:
-        case IrCmd::JUMP_IF_FALSY:
-        case IrCmd::JUMP_EQ_TAG:
-        case IrCmd::JUMP_EQ_INT:
-        case IrCmd::JUMP_EQ_POINTER:
-        case IrCmd::JUMP_CMP_NUM:
-        case IrCmd::JUMP_CMP_ANY:
-        case IrCmd::TABLE_LEN:
-        case IrCmd::NEW_TABLE:
-        case IrCmd::DUP_TABLE:
-        case IrCmd::TRY_NUM_TO_INDEX:
-        case IrCmd::INT_TO_NUM:
-        case IrCmd::ADJUST_STACK_TO_REG:
-        case IrCmd::ADJUST_STACK_TO_TOP:
-        case IrCmd::INVOKE_FASTCALL:
-        case IrCmd::CHECK_FASTCALL_RES:
-        case IrCmd::DO_ARITH:
-        case IrCmd::DO_LEN:
-        case IrCmd::GET_TABLE:
-        case IrCmd::SET_TABLE:
-        case IrCmd::GET_IMPORT:
-        case IrCmd::CONCAT:
-        case IrCmd::GET_UPVALUE:
-        case IrCmd::SET_UPVALUE:
-        case IrCmd::PREPARE_FORN:
-        case IrCmd::CHECK_TAG:
-        case IrCmd::CHECK_READONLY:
-        case IrCmd::CHECK_NO_METATABLE:
-        case IrCmd::CHECK_SAFE_ENV:
-        case IrCmd::CHECK_ARRAY_SIZE:
-        case IrCmd::CHECK_SLOT_MATCH:
-        case IrCmd::INTERRUPT:
-        case IrCmd::CHECK_GC:
-        case IrCmd::BARRIER_OBJ:
-        case IrCmd::BARRIER_TABLE_BACK:
-        case IrCmd::BARRIER_TABLE_FORWARD:
-        case IrCmd::SET_SAVEDPC:
-        case IrCmd::CLOSE_UPVALS:
-        case IrCmd::CAPTURE:
-        case IrCmd::CALL:
-        case IrCmd::RETURN:
-        case IrCmd::FALLBACK_GETGLOBAL:
-        case IrCmd::FALLBACK_SETGLOBAL:
-        case IrCmd::FALLBACK_GETTABLEKS:
-        case IrCmd::FALLBACK_SETTABLEKS:
-        case IrCmd::FALLBACK_NAMECALL:
-        case IrCmd::FALLBACK_PREPVARARGS:
-        case IrCmd::FALLBACK_GETVARARGS:
-        case IrCmd::FALLBACK_NEWCLOSURE:
-        case IrCmd::FALLBACK_DUPCLOSURE:
-        case IrCmd::SUBSTITUTE:
-            continue;
-
-        default:
-#ifdef TRACE
-            printf("A64 lowering missing %s\n", getCmdName(inst.cmd));
-#endif
-            return false;
-        }
-    }
-
-#ifdef TRACE
-    gStatsA64.can++;
-#endif
-
-    return true;
 }

 void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
@ -245,14 +334,14 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    case IrCmd::LOAD_POINTER:
    {
        inst.regA64 = regs.allocReg(KindA64::x);
-        AddressA64 addr = tempAddr(inst.a, offsetof(TValue, value));
+        AddressA64 addr = tempAddr(inst.a, offsetof(TValue, value.gc));
        build.ldr(inst.regA64, addr);
        break;
    }
    case IrCmd::LOAD_DOUBLE:
    {
        inst.regA64 = regs.allocReg(KindA64::d);
-        AddressA64 addr = tempAddr(inst.a, offsetof(TValue, value));
+        AddressA64 addr = tempAddr(inst.a, offsetof(TValue, value.n));
        build.ldr(inst.regA64, addr);
        break;
    }
@ -287,13 +376,21 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)

        if (inst.b.kind == IrOpKind::Inst)
        {
-            // TODO: This is a temporary hack that reads wN register as if it was xN. This should use unsigned extension shift once we support it.
-            build.add(inst.regA64, inst.regA64, castReg(KindA64::x, regOp(inst.b)), kTValueSizeLog2);
+            build.add(inst.regA64, inst.regA64, zextReg(regOp(inst.b)), kTValueSizeLog2);
        }
        else if (inst.b.kind == IrOpKind::Constant)
        {
-            LUAU_ASSERT(size_t(intOp(inst.b)) <= AssemblyBuilderA64::kMaxImmediate >> kTValueSizeLog2); // TODO: handle out of range values
-            build.add(inst.regA64, inst.regA64, uint16_t(intOp(inst.b) << kTValueSizeLog2));
+            // TODO: refactor into a common helper? can't use emitAddOffset because we need a temp register
+            if (intOp(inst.b) * sizeof(TValue) <= AssemblyBuilderA64::kMaxImmediate)
+            {
+                build.add(inst.regA64, inst.regA64, uint16_t(intOp(inst.b) * sizeof(TValue)));
+            }
+            else
+            {
+                RegisterA64 temp = regs.allocTemp(KindA64::x);
+                build.mov(temp, intOp(inst.b) * sizeof(TValue));
+                build.add(inst.regA64, inst.regA64, temp);
+            }
        }
        else
            LUAU_ASSERT(!"Unsupported instruction form");
@ -314,8 +411,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)

        // note: this may clobber inst.a, so it's important that we don't use it after this
        build.ldr(inst.regA64, mem(regOp(inst.a), offsetof(Table, node)));
-        // TODO: This is a temporary hack that reads wN register as if it was xN. This should use unsigned extension shift once we support it.
-        build.add(inst.regA64, inst.regA64, castReg(KindA64::x, temp2), kLuaNodeSizeLog2);
+        build.add(inst.regA64, inst.regA64, zextReg(temp2), kLuaNodeSizeLog2);
        break;
    }
    case IrCmd::GET_HASH_NODE_ADDR:
@ -324,18 +420,16 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        RegisterA64 temp1 = regs.allocTemp(KindA64::w);
        RegisterA64 temp2 = regs.allocTemp(KindA64::w);

-        // TODO: this can use bic (andnot) to do hash & ~(-1 << lsizenode) instead but we don't support it yet
-        build.mov(temp1, 1);
+        // hash & ((1 << lsizenode) - 1) == hash & ~(-1 << lsizenode)
+        build.mov(temp1, -1);
        build.ldrb(temp2, mem(regOp(inst.a), offsetof(Table, lsizenode)));
        build.lsl(temp1, temp1, temp2);
-        build.sub(temp1, temp1, 1);
        build.mov(temp2, uintOp(inst.b));
-        build.and_(temp2, temp2, temp1);
+        build.bic(temp2, temp2, temp1);

        // note: this may clobber inst.a, so it's important that we don't use it after this
        build.ldr(inst.regA64, mem(regOp(inst.a), offsetof(Table, node)));
-        // TODO: This is a temporary hack that reads wN register as if it was xN. This should use unsigned extension shift once we support it.
-        build.add(inst.regA64, inst.regA64, castReg(KindA64::x, temp2), kLuaNodeSizeLog2);
+        build.add(inst.regA64, inst.regA64, zextReg(temp2), kLuaNodeSizeLog2);
        break;
    }
    case IrCmd::STORE_TAG:
@ -501,6 +595,37 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        build.fabs(inst.regA64, temp);
        break;
    }
+    case IrCmd::NOT_ANY:
+    {
+        inst.regA64 = regs.allocReuse(KindA64::w, index, {inst.a, inst.b});
+
+        if (inst.a.kind == IrOpKind::Constant)
+        {
+            // other cases should've been constant folded
+            LUAU_ASSERT(tagOp(inst.a) == LUA_TBOOLEAN);
+            build.eor(inst.regA64, regOp(inst.b), 1);
+        }
+        else
+        {
+            Label notbool, exit;
+
+            // use the fact that NIL is the only value less than BOOLEAN to do two tag comparisons at once
+            LUAU_ASSERT(LUA_TNIL == 0 && LUA_TBOOLEAN == 1);
+            build.cmp(regOp(inst.a), LUA_TBOOLEAN);
+            build.b(ConditionA64::NotEqual, notbool);
+
+            // boolean => invert value
+            build.eor(inst.regA64, regOp(inst.b), 1);
+            build.b(exit);
+
+            // not boolean => result is true iff tag was nil
+            build.setLabel(notbool);
+            build.cset(inst.regA64, ConditionA64::Less);
+
+            build.setLabel(exit);
+        }
+        break;
+    }
    case IrCmd::JUMP:
        jumpOrFallthrough(blockOp(inst.a), next);
        break;
@ -537,10 +662,12 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    }
    case IrCmd::JUMP_EQ_TAG:
-        if (inst.b.kind == IrOpKind::Constant)
+        if (inst.a.kind == IrOpKind::Inst && inst.b.kind == IrOpKind::Constant)
            build.cmp(regOp(inst.a), tagOp(inst.b));
-        else if (inst.b.kind == IrOpKind::Inst)
+        else if (inst.a.kind == IrOpKind::Inst && inst.b.kind == IrOpKind::Inst)
            build.cmp(regOp(inst.a), regOp(inst.b));
+        else if (inst.a.kind == IrOpKind::Constant && inst.b.kind == IrOpKind::Inst)
+            build.cmp(regOp(inst.b), tagOp(inst.a));
        else
            LUAU_ASSERT(!"Unsupported instruction form");

@ -570,10 +697,20 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    {
        IrCondition cond = conditionOp(inst.c);

-        RegisterA64 temp1 = tempDouble(inst.a);
-        RegisterA64 temp2 = tempDouble(inst.b);
+        if (inst.b.kind == IrOpKind::Constant && doubleOp(inst.b) == 0.0)
+        {
+            RegisterA64 temp = tempDouble(inst.a);
+
+            build.fcmpz(temp);
+        }
+        else
+        {
+            RegisterA64 temp1 = tempDouble(inst.a);
+            RegisterA64 temp2 = tempDouble(inst.b);
+
+            build.fcmp(temp1, temp2);
+        }

-        build.fcmp(temp1, temp2);
        build.b(getConditionFP(cond), labelOp(inst.d));
        jumpOrFallthrough(blockOp(inst.e), next);
        break;
@ -607,6 +744,30 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        jumpOrFallthrough(blockOp(inst.e), next);
        break;
    }
+    case IrCmd::JUMP_SLOT_MATCH:
+    {
+        // TODO: share code with CHECK_SLOT_MATCH
+        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
+        RegisterA64 temp1w = castReg(KindA64::w, temp1);
+        RegisterA64 temp2 = regs.allocTemp(KindA64::x);
+
+        build.ldr(temp1w, mem(regOp(inst.a), offsetof(LuaNode, key) + kOffsetOfTKeyTag));
+        build.and_(temp1w, temp1w, kLuaNodeTagMask);
+        build.cmp(temp1w, LUA_TSTRING);
+        build.b(ConditionA64::NotEqual, labelOp(inst.d));
+
+        AddressA64 addr = tempAddr(inst.b, offsetof(TValue, value));
+        build.ldr(temp1, mem(regOp(inst.a), offsetof(LuaNode, key.value)));
+        build.ldr(temp2, addr);
+        build.cmp(temp1, temp2);
+        build.b(ConditionA64::NotEqual, labelOp(inst.d));
+
+        build.ldr(temp1w, mem(regOp(inst.a), offsetof(LuaNode, val.tt)));
+        LUAU_ASSERT(LUA_TNIL == 0);
+        build.cbz(temp1w, labelOp(inst.d));
+        jumpOrFallthrough(blockOp(inst.c), next);
+        break;
+    }
    case IrCmd::TABLE_LEN:
    {
        regs.assertAllFreeExcept(regOp(inst.a));
@ -664,6 +825,32 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        }
        break;
    }
+    case IrCmd::TRY_CALL_FASTGETTM:
+    {
+        regs.assertAllFreeExcept(regOp(inst.a));
+
+        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
+        RegisterA64 temp2 = regs.allocTemp(KindA64::w);
+
+        build.ldr(temp1, mem(regOp(inst.a), offsetof(Table, metatable)));
+        build.cbz(temp1, labelOp(inst.c)); // no metatable
+
+        build.ldrb(temp2, mem(temp1, offsetof(Table, tmcache)));
+        build.tst(temp2, 1 << intOp(inst.b));             // can't use tbz/tbnz because their jump offsets are too short
+        build.b(ConditionA64::NotEqual, labelOp(inst.c)); // Equal = Zero after tst; tmcache caches *absence* of metamethods
+
+        build.mov(x0, temp1);
+        build.mov(w1, intOp(inst.b));
+        build.ldr(x2, mem(rState, offsetof(lua_State, global)));
+        build.ldr(x2, mem(x2, offsetof(global_State, tmname) + intOp(inst.b) * sizeof(TString*)));
+        build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, luaT_gettm)));
+        build.blr(x3);
+
+        // TODO: we could takeReg x0 but it's unclear if we will be able to keep x0 allocatable due to aliasing concerns
+        inst.regA64 = regs.allocReg(KindA64::x);
+        build.mov(inst.regA64, x0);
+        break;
+    }
    case IrCmd::INT_TO_NUM:
    {
        inst.regA64 = regs.allocReg(KindA64::d);
@ -683,8 +870,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        else if (inst.b.kind == IrOpKind::Inst)
        {
            build.add(temp, rBase, uint16_t(vmRegOp(inst.a) * sizeof(TValue)));
-            // TODO: This is a temporary hack that reads wN register as if it was xN. This should use unsigned extension shift once we support it.
-            build.add(temp, temp, castReg(KindA64::x, regOp(inst.b)), kTValueSizeLog2);
+            build.add(temp, temp, zextReg(regOp(inst.b)), kTValueSizeLog2);
            build.str(temp, mem(rState, offsetof(lua_State, top)));
        }
        else
@ -699,6 +885,12 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        build.str(temp, mem(rState, offsetof(lua_State, top)));
        break;
    }
+    case IrCmd::FASTCALL:
+        regs.assertAllFree();
+        // TODO: emitBuiltin should be exhaustive
+        if (!emitBuiltin(build, regs, uintOp(inst.a), vmRegOp(inst.b), vmRegOp(inst.c), inst.d, intOp(inst.e), intOp(inst.f)))
+            error = true;
+        break;
    case IrCmd::INVOKE_FASTCALL:
    {
        regs.assertAllFree();
@ -710,18 +902,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        if (inst.d.kind == IrOpKind::VmReg)
            build.add(x4, rBase, uint16_t(vmRegOp(inst.d) * sizeof(TValue)));
        else if (inst.d.kind == IrOpKind::VmConst)
-        {
-            // TODO: refactor into a common helper
-            if (vmConstOp(inst.d) * sizeof(TValue) <= AssemblyBuilderA64::kMaxImmediate)
-            {
-                build.add(x4, rConstants, uint16_t(vmConstOp(inst.d) * sizeof(TValue)));
-            }
-            else
-            {
-                build.mov(x4, vmConstOp(inst.d) * sizeof(TValue));
-                build.add(x4, rConstants, x4);
-            }
-        }
+            emitAddOffset(build, x4, rConstants, vmConstOp(inst.d) * sizeof(TValue));
        else
            LUAU_ASSERT(boolOp(inst.d) == false);

@ -742,7 +923,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        build.ldr(x6, mem(rNativeContext, offsetof(NativeContext, luauF_table) + uintOp(inst.a) * sizeof(luau_FastFunction)));
        build.blr(x6);

-        // TODO: we could takeReg w0 but it's unclear if we will be able to keep x0 allocatable due to aliasing concerns
+        // since w0 came from a call, we need to move it so that we don't violate zextReg safety contract
        inst.regA64 = regs.allocReg(KindA64::w);
        build.mov(inst.regA64, w0);
        break;
@ -758,18 +939,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        build.add(x2, rBase, uint16_t(vmRegOp(inst.b) * sizeof(TValue)));

        if (inst.c.kind == IrOpKind::VmConst)
-        {
-            // TODO: refactor into a common helper
-            if (vmConstOp(inst.c) * sizeof(TValue) <= AssemblyBuilderA64::kMaxImmediate)
-            {
-                build.add(x3, rConstants, uint16_t(vmConstOp(inst.c) * sizeof(TValue)));
-            }
-            else
-            {
-                build.mov(x3, vmConstOp(inst.c) * sizeof(TValue));
-                build.add(x3, rConstants, x3);
-            }
-        }
+            emitAddOffset(build, x3, rConstants, vmConstOp(inst.c) * sizeof(TValue));
        else
            build.add(x3, rBase, uint16_t(vmRegOp(inst.c) * sizeof(TValue)));

@ -835,7 +1005,25 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    case IrCmd::GET_IMPORT:
        regs.assertAllFree();
-        emitInstGetImport(build, vmRegOp(inst.a), uintOp(inst.b));
+        // luaV_getimport(L, cl->env, k, aux, /* propagatenil= */ false)
+        build.mov(x0, rState);
+        build.ldr(x1, mem(rClosure, offsetof(Closure, env)));
+        build.mov(x2, rConstants);
+        build.mov(w3, uintOp(inst.b));
+        build.mov(w4, 0);
+        build.ldr(x5, mem(rNativeContext, offsetof(NativeContext, luaV_getimport)));
+        build.blr(x5);
+
+        emitUpdateBase(build);
+
+        // setobj2s(L, ra, L->top - 1)
+        build.ldr(x0, mem(rState, offsetof(lua_State, top)));
+        build.sub(x0, x0, sizeof(TValue));
+        build.ldr(q0, x0);
+        build.str(q0, mem(rBase, vmRegOp(inst.a) * sizeof(TValue)));
+
+        // L->top--
+        build.str(x0, mem(rState, offsetof(lua_State, top)));
        break;
    case IrCmd::CONCAT:
        regs.assertAllFree();
@ -877,7 +1065,6 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
        RegisterA64 temp2 = regs.allocTemp(KindA64::x);
        RegisterA64 temp3 = regs.allocTemp(KindA64::q);
-        RegisterA64 temp4 = regs.allocTemp(KindA64::x);

        // UpVal*
        build.ldr(temp1, mem(rClosure, offsetof(Closure, l.uprefs) + sizeof(TValue) * vmUpvalueOp(inst.a) + offsetof(TValue, value.gc)));
@ -887,7 +1074,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        build.str(temp3, temp2);

        Label skip;
-        checkObjectBarrierConditions(build, temp1, temp2, temp4, vmRegOp(inst.b), skip);
+        checkObjectBarrierConditions(build, temp1, temp2, vmRegOp(inst.b), skip);

        build.mov(x0, rState);
        build.mov(x1, temp1); // TODO: aliasing hazard
@ -945,8 +1132,17 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
            build.cmp(temp, regOp(inst.b));
        else if (inst.b.kind == IrOpKind::Constant)
        {
-            LUAU_ASSERT(size_t(intOp(inst.b)) <= AssemblyBuilderA64::kMaxImmediate); // TODO: handle out of range values
-            build.cmp(temp, uint16_t(intOp(inst.b)));
+            // TODO: refactor into a common helper?
+            if (size_t(intOp(inst.b)) <= AssemblyBuilderA64::kMaxImmediate)
+            {
+                build.cmp(temp, uint16_t(intOp(inst.b)));
+            }
+            else
+            {
+                RegisterA64 temp2 = regs.allocTemp(KindA64::w);
+                build.mov(temp2, intOp(inst.b));
+                build.cmp(temp, temp2);
+            }
        }
        else
            LUAU_ASSERT(!"Unsupported instruction form");
@ -959,12 +1155,9 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
        RegisterA64 temp1w = castReg(KindA64::w, temp1);
        RegisterA64 temp2 = regs.allocTemp(KindA64::x);
-        RegisterA64 temp2w = castReg(KindA64::w, temp2);

-        build.ldr(temp1w, mem(regOp(inst.a), kOffsetOfLuaNodeTag));
-        // TODO: this needs bitfield extraction, or and-immediate
-        build.mov(temp2w, kLuaNodeTagMask);
-        build.and_(temp1w, temp1w, temp2w);
+        build.ldr(temp1w, mem(regOp(inst.a), offsetof(LuaNode, key) + kOffsetOfTKeyTag));
+        build.and_(temp1w, temp1w, kLuaNodeTagMask);
        build.cmp(temp1w, LUA_TSTRING);
        build.b(ConditionA64::NotEqual, labelOp(inst.c));

@ -979,6 +1172,15 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        build.cbz(temp1w, labelOp(inst.c));
        break;
    }
+    case IrCmd::CHECK_NODE_NO_NEXT:
+    {
+        RegisterA64 temp = regs.allocTemp(KindA64::w);
+
+        build.ldr(temp, mem(regOp(inst.a), offsetof(LuaNode, key) + kOffsetOfTKeyNext));
+        build.and_(temp, temp, ~((1u << kNextBitOffset) - 1)); // TODO: this would be cleaner with a right shift
+        build.cbnz(temp, labelOp(inst.b));
+        break;
+    }
    case IrCmd::INTERRUPT:
    {
        unsigned int pcpos = uintOp(inst.a);
@ -1023,11 +1225,10 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    {
        regs.assertAllFreeExcept(regOp(inst.a));

-        Label skip;
-        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
-        RegisterA64 temp2 = regs.allocTemp(KindA64::x);
+        RegisterA64 temp = regs.allocTemp(KindA64::x);

-        checkObjectBarrierConditions(build, regOp(inst.a), temp1, temp2, vmRegOp(inst.b), skip);
+        Label skip;
+        checkObjectBarrierConditions(build, regOp(inst.a), temp, vmRegOp(inst.b), skip);

        build.mov(x0, rState);
        build.mov(x1, regOp(inst.a)); // TODO: aliasing hazard
@ -1044,15 +1245,13 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        regs.assertAllFreeExcept(regOp(inst.a));

        Label skip;
-        RegisterA64 temp1 = regs.allocTemp(KindA64::w);
-        RegisterA64 temp2 = regs.allocTemp(KindA64::w);
+        RegisterA64 temp = regs.allocTemp(KindA64::w);

        // isblack(obj2gco(t))
-        build.ldrb(temp1, mem(regOp(inst.a), offsetof(GCheader, marked)));
+        build.ldrb(temp, mem(regOp(inst.a), offsetof(GCheader, marked)));
        // TODO: conditional bit test with BLACKBIT
-        build.mov(temp2, bitmask(BLACKBIT));
-        build.and_(temp1, temp1, temp2);
-        build.cbz(temp1, skip);
+        build.tst(temp, bitmask(BLACKBIT));
+        build.b(ConditionA64::Equal, skip); // Equal = Zero after tst

        build.mov(x0, rState);
        build.mov(x1, regOp(inst.a)); // TODO: aliasing hazard here and below
@ -1068,11 +1267,10 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    {
        regs.assertAllFreeExcept(regOp(inst.a));

-        Label skip;
-        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
-        RegisterA64 temp2 = regs.allocTemp(KindA64::x);
+        RegisterA64 temp = regs.allocTemp(KindA64::x);

-        checkObjectBarrierConditions(build, regOp(inst.a), temp1, temp2, vmRegOp(inst.b), skip);
+        Label skip;
+        checkObjectBarrierConditions(build, regOp(inst.a), temp, vmRegOp(inst.b), skip);

        build.mov(x0, rState);
        build.mov(x1, regOp(inst.a)); // TODO: aliasing hazard
@ -1086,21 +1284,10 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    }
    case IrCmd::SET_SAVEDPC:
    {
-        unsigned int pcpos = uintOp(inst.a);
        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
        RegisterA64 temp2 = regs.allocTemp(KindA64::x);

-        // TODO: refactor into a common helper
-        if (pcpos * sizeof(Instruction) <= AssemblyBuilderA64::kMaxImmediate)
-        {
-            build.add(temp1, rCode, uint16_t(pcpos * sizeof(Instruction)));
-        }
-        else
-        {
-            build.mov(temp1, pcpos * sizeof(Instruction));
-            build.add(temp1, rCode, temp1);
-        }
-
+        emitAddOffset(build, temp1, rCode, uintOp(inst.a) * sizeof(Instruction));
        build.ldr(temp2, mem(rState, offsetof(lua_State, ci)));
        build.str(temp1, mem(temp2, offsetof(CallInfo, savedpc)));
        break;
@ -1133,14 +1320,100 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    case IrCmd::CAPTURE:
        // no-op
        break;
+    case IrCmd::SETLIST:
+        regs.assertAllFree();
+        emitFallback(build, LOP_SETLIST, uintOp(inst.a));
+        break;
    case IrCmd::CALL:
        regs.assertAllFree();
-        emitInstCall(build, helpers, vmRegOp(inst.a), intOp(inst.b), intOp(inst.c));
+        // argtop = (nparams == LUA_MULTRET) ? L->top : ra + 1 + nparams;
+        if (intOp(inst.b) == LUA_MULTRET)
+            build.ldr(x2, mem(rState, offsetof(lua_State, top)));
+        else
+            build.add(x2, rBase, uint16_t((vmRegOp(inst.a) + 1 + intOp(inst.b)) * sizeof(TValue)));
+
+        // callFallback(L, ra, argtop, nresults)
+        build.mov(x0, rState);
+        build.add(x1, rBase, uint16_t(vmRegOp(inst.a) * sizeof(TValue)));
+        build.mov(w3, intOp(inst.c));
+        build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, callFallback)));
+        build.blr(x4);
+
+        // reentry with x0=closure (NULL will trigger exit)
+        build.b(helpers.reentry);
        break;
    case IrCmd::RETURN:
        regs.assertAllFree();
-        emitInstReturn(build, helpers, vmRegOp(inst.a), intOp(inst.b));
+        // returnFallback(L, ra, n)
+        build.mov(x0, rState);
+        build.add(x1, rBase, uint16_t(vmRegOp(inst.a) * sizeof(TValue)));
+        build.mov(w2, intOp(inst.b));
+        build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, returnFallback)));
+        build.blr(x3);
+
+        // reentry with x0=closure (NULL will trigger exit)
+        build.b(helpers.reentry);
        break;
+    case IrCmd::FORGLOOP:
+        // register layout: ra + 1 = table, ra + 2 = internal index, ra + 3 .. ra + aux = iteration variables
+        regs.assertAllFree();
+        // clear extra variables since we might have more than two
+        if (intOp(inst.b) > 2)
+        {
+            build.mov(w0, LUA_TNIL);
+            for (int i = 2; i < intOp(inst.b); ++i)
+                build.str(w0, mem(rBase, (vmRegOp(inst.a) + 3 + i) * sizeof(TValue) + offsetof(TValue, tt)));
+        }
+        // we use full iter fallback for now; in the future it could be worthwhile to accelerate array iteration here
+        build.mov(x0, rState);
+        build.ldr(x1, mem(rBase, (vmRegOp(inst.a) + 1) * sizeof(TValue) + offsetof(TValue, value.gc)));
+        build.ldr(w2, mem(rBase, (vmRegOp(inst.a) + 2) * sizeof(TValue) + offsetof(TValue, value.p)));
+        build.add(x3, rBase, uint16_t(vmRegOp(inst.a) * sizeof(TValue)));
+        build.ldr(x4, mem(rNativeContext, offsetof(NativeContext, forgLoopTableIter)));
+        build.blr(x4);
+        // note: no emitUpdateBase necessary because forgLoopTableIter does not reallocate stack
+        build.cbnz(w0, labelOp(inst.c));
+        jumpOrFallthrough(blockOp(inst.d), next);
+        break;
+    case IrCmd::FORGLOOP_FALLBACK:
+        regs.assertAllFree();
+        build.mov(x0, rState);
+        build.mov(w1, vmRegOp(inst.a));
+        build.mov(w2, intOp(inst.b));
+        build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, forgLoopNonTableFallback)));
+        build.blr(x3);
+        emitUpdateBase(build);
+        build.cbnz(w0, labelOp(inst.c));
+        jumpOrFallthrough(blockOp(inst.d), next);
+        break;
+    case IrCmd::FORGPREP_XNEXT_FALLBACK:
+        regs.assertAllFree();
+        build.mov(x0, rState);
+        build.add(x1, rBase, uint16_t(vmRegOp(inst.b) * sizeof(TValue)));
+        build.mov(w2, uintOp(inst.a) + 1);
+        build.ldr(x3, mem(rNativeContext, offsetof(NativeContext, forgPrepXnextFallback)));
+        build.blr(x3);
+        // note: no emitUpdateBase emitted; presumably forgPrepXnextFallback (the helper called here) does not reallocate stack
+        jumpOrFallthrough(blockOp(inst.c), next);
+        break;
+    case IrCmd::COVERAGE:
+    {
+        RegisterA64 temp1 = regs.allocTemp(KindA64::x);
+        RegisterA64 temp2 = regs.allocTemp(KindA64::w);
+        RegisterA64 temp3 = regs.allocTemp(KindA64::w);
+
+        build.mov(temp1, uintOp(inst.a) * sizeof(Instruction));
+        build.ldr(temp2, mem(rCode, temp1));
+
+        // increments E (high 24 bits); if the result overflows a 23-bit counter, high bit becomes 1
+        // note: cmp can be eliminated with adds but we aren't concerned with code size for coverage
+        build.add(temp3, temp2, 256);
+        build.cmp(temp3, 0);
+        build.csel(temp2, temp2, temp3, ConditionA64::Less);
+
+        build.str(temp2, mem(rCode, temp1));
+        break;
+    }

        // Full instruction fallbacks
    case IrCmd::FALLBACK_GETGLOBAL:
@ -1208,9 +1481,25 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        regs.assertAllFree();
        emitFallback(build, LOP_DUPCLOSURE, uintOp(inst.a));
        break;
+    case IrCmd::FALLBACK_FORGPREP:
+        regs.assertAllFree();
+        emitFallback(build, LOP_FORGPREP, uintOp(inst.a));
+        jumpOrFallthrough(blockOp(inst.c), next);
+        break;

-    default:
-        LUAU_ASSERT(!"Not supported yet");
+    // Pseudo instructions
+    case IrCmd::NOP:
+    case IrCmd::SUBSTITUTE:
+        LUAU_ASSERT(!"Pseudo instructions should not be lowered");
+        break;
+
+    // Unsupported instructions
+    // Note: when adding implementations for these, please move the case: label so that implemented instructions match the order in IrData.h
+    case IrCmd::STORE_VECTOR:
+#ifdef TRACE
+        gStatsA64.missing++;
+#endif
+        error = true;
        break;
    }

@ -1220,7 +1509,7 @@ void IrLoweringA64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)

 bool IrLoweringA64::hasError() const
 {
-    return false;
+    return error;
 }

 bool IrLoweringA64::isFallthroughBlock(IrBlock target, IrBlock next)
@ -1287,17 +1576,7 @@ AddressA64 IrLoweringA64::tempAddr(IrOp op, int offset)

        RegisterA64 temp = regs.allocTemp(KindA64::x);

-        // TODO: refactor into a common helper
-        if (constantOffset <= AssemblyBuilderA64::kMaxImmediate)
-        {
-            build.add(temp, rConstants, uint16_t(constantOffset));
-        }
-        else
-        {
-            build.mov(temp, int(constantOffset));
-            build.add(temp, rConstants, temp);
-        }
-
+        emitAddOffset(build, temp, rConstants, constantOffset);
        return temp;
    }
    // If we have a register, we assume it's a pointer to TValue
--- a/CodeGen/src/IrLoweringA64.h
+++ b/CodeGen/src/IrLoweringA64.h
@ -26,8 +26,6 @@ struct IrLoweringA64
 {
    IrLoweringA64(AssemblyBuilderA64& build, ModuleHelpers& helpers, NativeState& data, Proto* proto, IrFunction& function);

-    static bool canLower(const IrFunction& function);
-
    void lowerInst(IrInst& inst, uint32_t index, IrBlock& next);

    bool hasError() const;
@ -61,6 +59,8 @@ struct IrLoweringA64
    IrFunction& function;

    IrRegAllocA64 regs;
+
+    bool error = false;
 };

 } // namespace A64
--- a/CodeGen/src/IrLoweringX64.cpp
+++ b/CodeGen/src/IrLoweringX64.cpp
@ -31,6 +31,8 @@ IrLoweringX64::IrLoweringX64(AssemblyBuilderX64& build, ModuleHelpers& helpers,
 {
    // In order to allocate registers during lowering, we need to know where instruction results are last used
    updateLastUseLocations(function);
+
+    build.align(kFunctionAlignment, X64::AlignmentDataX64::Ud2);
 }

 void IrLoweringX64::storeDoubleAsFloat(OperandX64 dst, IrOp src)
@ -59,7 +61,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    switch (inst.cmd)
    {
    case IrCmd::LOAD_TAG:
-        inst.regX64 = regs.allocGprReg(SizeX64::dword, index);
+        inst.regX64 = regs.allocReg(SizeX64::dword, index);

        if (inst.a.kind == IrOpKind::VmReg)
            build.mov(inst.regX64, luauRegTag(vmRegOp(inst.a)));
@ -73,7 +75,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
            LUAU_ASSERT(!"Unsupported instruction form");
        break;
    case IrCmd::LOAD_POINTER:
-        inst.regX64 = regs.allocGprReg(SizeX64::qword, index);
+        inst.regX64 = regs.allocReg(SizeX64::qword, index);

        if (inst.a.kind == IrOpKind::VmReg)
            build.mov(inst.regX64, luauRegValue(vmRegOp(inst.a)));
@ -87,7 +89,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
            LUAU_ASSERT(!"Unsupported instruction form");
        break;
    case IrCmd::LOAD_DOUBLE:
-        inst.regX64 = regs.allocXmmReg(index);
+        inst.regX64 = regs.allocReg(SizeX64::xmmword, index);

        if (inst.a.kind == IrOpKind::VmReg)
            build.vmovsd(inst.regX64, luauRegValue(vmRegOp(inst.a)));
@ -97,12 +99,12 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
            LUAU_ASSERT(!"Unsupported instruction form");
        break;
    case IrCmd::LOAD_INT:
-        inst.regX64 = regs.allocGprReg(SizeX64::dword, index);
+        inst.regX64 = regs.allocReg(SizeX64::dword, index);

        build.mov(inst.regX64, luauRegValueInt(vmRegOp(inst.a)));
        break;
    case IrCmd::LOAD_TVALUE:
-        inst.regX64 = regs.allocXmmReg(index);
+        inst.regX64 = regs.allocReg(SizeX64::xmmword, index);

        if (inst.a.kind == IrOpKind::VmReg)
            build.vmovups(inst.regX64, luauReg(vmRegOp(inst.a)));
@ -114,12 +116,12 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
            LUAU_ASSERT(!"Unsupported instruction form");
        break;
    case IrCmd::LOAD_NODE_VALUE_TV:
-        inst.regX64 = regs.allocXmmReg(index);
+        inst.regX64 = regs.allocReg(SizeX64::xmmword, index);

        build.vmovups(inst.regX64, luauNodeValue(regOp(inst.a)));
        break;
    case IrCmd::LOAD_ENV:
-        inst.regX64 = regs.allocGprReg(SizeX64::qword, index);
+        inst.regX64 = regs.allocReg(SizeX64::qword, index);

        build.mov(inst.regX64, sClosure);
        build.mov(inst.regX64, qword[inst.regX64 + offsetof(Closure, env)]);
@ -127,7 +129,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    case IrCmd::GET_ARR_ADDR:
        if (inst.b.kind == IrOpKind::Inst)
        {
-            inst.regX64 = regs.allocGprRegOrReuse(SizeX64::qword, index, {inst.b});
+            inst.regX64 = regs.allocRegOrReuse(SizeX64::qword, index, {inst.b});

            if (dwordReg(inst.regX64) != regOp(inst.b))
                build.mov(dwordReg(inst.regX64), regOp(inst.b));
@ -137,7 +139,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        }
        else if (inst.b.kind == IrOpKind::Constant)
        {
-            inst.regX64 = regs.allocGprRegOrReuse(SizeX64::qword, index, {inst.a});
+            inst.regX64 = regs.allocRegOrReuse(SizeX64::qword, index, {inst.a});

            build.mov(inst.regX64, qword[regOp(inst.a) + offsetof(Table, array)]);

@ -151,7 +153,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    case IrCmd::GET_SLOT_NODE_ADDR:
    {
-        inst.regX64 = regs.allocGprReg(SizeX64::qword, index);
+        inst.regX64 = regs.allocReg(SizeX64::qword, index);

        ScopedRegX64 tmp{regs, SizeX64::qword};

@ -160,11 +162,11 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    }
    case IrCmd::GET_HASH_NODE_ADDR:
    {
-        inst.regX64 = regs.allocGprReg(SizeX64::qword, index);
-
        // Custom bit shift value can only be placed in cl
        ScopedRegX64 shiftTmp{regs, regs.takeReg(rcx, kInvalidInstIdx)};

+        inst.regX64 = regs.allocReg(SizeX64::qword, index);
+
        ScopedRegX64 tmp{regs, SizeX64::qword};

        build.mov(inst.regX64, qword[regOp(inst.a) + offsetof(Table, node)]);
@ -232,7 +234,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        build.vmovups(luauNodeValue(regOp(inst.a)), regOp(inst.b));
        break;
    case IrCmd::ADD_INT:
-        inst.regX64 = regs.allocGprRegOrReuse(SizeX64::dword, index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::dword, index, {inst.a});

        if (inst.regX64 == regOp(inst.a) && intOp(inst.b) == 1)
            build.inc(inst.regX64);
@ -242,7 +244,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
            build.lea(inst.regX64, addr[regOp(inst.a) + intOp(inst.b)]);
        break;
    case IrCmd::SUB_INT:
-        inst.regX64 = regs.allocGprRegOrReuse(SizeX64::dword, index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::dword, index, {inst.a});

        if (inst.regX64 == regOp(inst.a) && intOp(inst.b) == 1)
            build.dec(inst.regX64);
@ -252,7 +254,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
            build.lea(inst.regX64, addr[regOp(inst.a) - intOp(inst.b)]);
        break;
    case IrCmd::ADD_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});

        if (inst.a.kind == IrOpKind::Constant)
        {
@ -267,7 +269,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        }
        break;
    case IrCmd::SUB_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});

        if (inst.a.kind == IrOpKind::Constant)
        {
@ -282,7 +284,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        }
        break;
    case IrCmd::MUL_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});

        if (inst.a.kind == IrOpKind::Constant)
        {
@ -297,7 +299,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        }
        break;
    case IrCmd::DIV_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});

        if (inst.a.kind == IrOpKind::Constant)
        {
@ -313,7 +315,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    case IrCmd::MOD_NUM:
    {
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});

        ScopedRegX64 optLhsTmp{regs};
        RegisterX64 lhs;
@ -362,7 +364,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    }
    case IrCmd::MIN_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});

        if (inst.a.kind == IrOpKind::Constant)
        {
@ -377,7 +379,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        }
        break;
    case IrCmd::MAX_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a, inst.b});

        if (inst.a.kind == IrOpKind::Constant)
        {
@ -393,7 +395,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    case IrCmd::UNM_NUM:
    {
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});

        RegisterX64 src = regOp(inst.a);

@ -410,18 +412,18 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    }
    case IrCmd::FLOOR_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});

        build.vroundsd(inst.regX64, inst.regX64, memRegDoubleOp(inst.a), RoundingModeX64::RoundToNegativeInfinity);
        break;
    case IrCmd::CEIL_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});

        build.vroundsd(inst.regX64, inst.regX64, memRegDoubleOp(inst.a), RoundingModeX64::RoundToPositiveInfinity);
        break;
    case IrCmd::ROUND_NUM:
    {
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});

        ScopedRegX64 tmp1{regs, SizeX64::xmmword};
        ScopedRegX64 tmp2{regs, SizeX64::xmmword};
@ -439,12 +441,12 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    }
    case IrCmd::SQRT_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});

        build.vsqrtsd(inst.regX64, inst.regX64, memRegDoubleOp(inst.a));
        break;
    case IrCmd::ABS_NUM:
-        inst.regX64 = regs.allocXmmRegOrReuse(index, {inst.a});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::xmmword, index, {inst.a});

        if (inst.a.kind != IrOpKind::Inst)
            build.vmovsd(inst.regX64, memRegDoubleOp(inst.a));
@ -456,7 +458,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    case IrCmd::NOT_ANY:
    {
        // TODO: if we have a single user which is a STORE_INT, we are missing the opportunity to write directly to target
-        inst.regX64 = regs.allocGprRegOrReuse(SizeX64::dword, index, {inst.a, inst.b});
+        inst.regX64 = regs.allocRegOrReuse(SizeX64::dword, index, {inst.a, inst.b});

        Label saveone, savezero, exit;

@ -558,7 +560,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        callWrap.addArgument(SizeX64::qword, regOp(inst.a), inst.a);
        callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaH_getn)]);

-        inst.regX64 = regs.allocXmmReg(index);
+        inst.regX64 = regs.allocReg(SizeX64::xmmword, index);
        build.vcvtsi2sd(inst.regX64, inst.regX64, eax);
        break;
    }
@ -566,8 +568,8 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    {
        IrCallWrapperX64 callWrap(regs, build, index);
        callWrap.addArgument(SizeX64::qword, rState);
-        callWrap.addArgument(SizeX64::dword, int32_t(uintOp(inst.a)), inst.a);
-        callWrap.addArgument(SizeX64::dword, int32_t(uintOp(inst.b)), inst.b);
+        callWrap.addArgument(SizeX64::dword, int32_t(uintOp(inst.a)));
+        callWrap.addArgument(SizeX64::dword, int32_t(uintOp(inst.b)));
        callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaH_new)]);
        inst.regX64 = regs.takeReg(rax, index);
        break;
@ -583,7 +585,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
    }
    case IrCmd::TRY_NUM_TO_INDEX:
    {
-        inst.regX64 = regs.allocGprReg(SizeX64::dword, index);
+        inst.regX64 = regs.allocReg(SizeX64::dword, index);

        ScopedRegX64 tmp{regs, SizeX64::xmmword};

@ -620,7 +622,7 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    }
    case IrCmd::INT_TO_NUM:
-        inst.regX64 = regs.allocXmmReg(index);
+        inst.regX64 = regs.allocReg(SizeX64::xmmword, index);

        build.vcvtsi2sd(inst.regX64, inst.regX64, regOp(inst.a));
        break;
@ -688,11 +690,10 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)

        if (nparams == LUA_MULTRET)
        {
-            // Compute 'L->top - (ra + 1)', on SystemV, take r9 register to compute directly into the argument
-            // TODO: IrCallWrapperX64 should provide a way to 'guess' target argument register correctly
-            RegisterX64 reg = build.abi == ABIX64::Windows ? regs.allocGprReg(SizeX64::qword, kInvalidInstIdx) : regs.takeReg(rArg6, kInvalidInstIdx);
+            RegisterX64 reg = callWrap.suggestNextArgumentRegister(SizeX64::qword);
            ScopedRegX64 tmp{regs, SizeX64::qword};

+            // L->top - (ra + 1)
            build.mov(reg, qword[rState + offsetof(lua_State, top)]);
            build.lea(tmp.reg, addr[rBase + (ra + 1) * sizeof(TValue)]);
            build.sub(reg, tmp.reg);
@ -759,9 +760,35 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        }
        break;
    case IrCmd::GET_IMPORT:
-        regs.assertAllFree();
-        emitInstGetImportFallback(build, vmRegOp(inst.a), uintOp(inst.b));
+    {
+        ScopedRegX64 tmp1{regs, SizeX64::qword};
+
+        build.mov(tmp1.reg, sClosure);
+
+        IrCallWrapperX64 callWrap(regs, build, index);
+        callWrap.addArgument(SizeX64::qword, rState);
+        callWrap.addArgument(SizeX64::qword, qword[tmp1.release() + offsetof(Closure, env)]);
+        callWrap.addArgument(SizeX64::qword, rConstants);
+        callWrap.addArgument(SizeX64::dword, uintOp(inst.b));
+        callWrap.addArgument(SizeX64::dword, 0);
+        callWrap.call(qword[rNativeContext + offsetof(NativeContext, luaV_getimport)]);
+
+        emitUpdateBase(build);
+
+        ScopedRegX64 tmp2{regs, SizeX64::qword};
+
+        // setobj2s(L, ra, L->top - 1)
+        build.mov(tmp2.reg, qword[rState + offsetof(lua_State, top)]);
+        build.sub(tmp2.reg, sizeof(TValue));
+
+        ScopedRegX64 tmp3{regs, SizeX64::xmmword};
+        build.vmovups(tmp3.reg, xmmword[tmp2.reg]);
+        build.vmovups(luauReg(vmRegOp(inst.a)), tmp3.reg);
+
+        // L->top--
+        build.mov(qword[rState + offsetof(lua_State, top)], tmp2.reg);
        break;
+    }
    case IrCmd::CONCAT:
    {
        IrCallWrapperX64 callWrap(regs, build, index);
@ -783,7 +810,6 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)

        // uprefs[] is either an actual value, or it points to UpVal object which has a pointer to value
        Label skip;
-        // TODO: jumpIfTagIsNot can be generalized to take OperandX64 and then we can use it here; let's wait until we see this more though
        build.cmp(dword[tmp1.reg + offsetof(TValue, tt)], LUA_TUPVAL);
        build.jcc(ConditionX64::NotEqual, skip);

@ -822,36 +848,25 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        callPrepareForN(regs, build, vmRegOp(inst.a), vmRegOp(inst.b), vmRegOp(inst.c));
        break;
    case IrCmd::CHECK_TAG:
-        if (inst.a.kind == IrOpKind::Inst)
-        {
-            build.cmp(regOp(inst.a), tagOp(inst.b));
-            build.jcc(ConditionX64::NotEqual, labelOp(inst.c));
-        }
-        else if (inst.a.kind == IrOpKind::VmReg)
-        {
-            jumpIfTagIsNot(build, vmRegOp(inst.a), lua_Type(tagOp(inst.b)), labelOp(inst.c));
-        }
-        else if (inst.a.kind == IrOpKind::VmConst)
-        {
-            build.cmp(luauConstantTag(vmConstOp(inst.a)), tagOp(inst.b));
-            build.jcc(ConditionX64::NotEqual, labelOp(inst.c));
-        }
-        else
-        {
-            LUAU_ASSERT(!"Unsupported instruction form");
-        }
+        build.cmp(memRegTagOp(inst.a), tagOp(inst.b));
+        build.jcc(ConditionX64::NotEqual, labelOp(inst.c));
        break;
    case IrCmd::CHECK_READONLY:
-        jumpIfTableIsReadOnly(build, regOp(inst.a), labelOp(inst.b));
+        build.cmp(byte[regOp(inst.a) + offsetof(Table, readonly)], 0);
+        build.jcc(ConditionX64::NotEqual, labelOp(inst.b));
        break;
    case IrCmd::CHECK_NO_METATABLE:
-        jumpIfMetatablePresent(build, regOp(inst.a), labelOp(inst.b));
+        build.cmp(qword[regOp(inst.a) + offsetof(Table, metatable)], 0);
+        build.jcc(ConditionX64::NotEqual, labelOp(inst.b));
        break;
    case IrCmd::CHECK_SAFE_ENV:
    {
        ScopedRegX64 tmp{regs, SizeX64::qword};

-        jumpIfUnsafeEnv(build, tmp.reg, labelOp(inst.a));
+        build.mov(tmp.reg, sClosure);
+        build.mov(tmp.reg, qword[tmp.reg + offsetof(Closure, env)]);
+        build.cmp(byte[tmp.reg + offsetof(Table, safeenv)], 0);
+        build.jcc(ConditionX64::Equal, labelOp(inst.a));
        break;
    }
    case IrCmd::CHECK_ARRAY_SIZE:
@ -872,11 +887,16 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    }
    case IrCmd::CHECK_NODE_NO_NEXT:
-        jumpIfNodeHasNext(build, regOp(inst.a), labelOp(inst.b));
+    {
+        ScopedRegX64 tmp{regs, SizeX64::dword};
+
+        build.mov(tmp.reg, dword[regOp(inst.a) + offsetof(LuaNode, key) + kOffsetOfTKeyNext]);
+        build.shr(tmp.reg, kNextBitOffset);
+        build.jcc(ConditionX64::NotZero, labelOp(inst.b));
        break;
+    }
    case IrCmd::INTERRUPT:
-        regs.assertAllFree();
-        emitInterrupt(build, uintOp(inst.a));
+        emitInterrupt(regs, build, uintOp(inst.a));
        break;
    case IrCmd::CHECK_GC:
        callStepGc(regs, build);
@ -970,94 +990,127 @@ void IrLoweringX64::lowerInst(IrInst& inst, uint32_t index, IrBlock& next)
        break;
    case IrCmd::FORGLOOP:
        regs.assertAllFree();
-        emitinstForGLoop(build, vmRegOp(inst.a), intOp(inst.b), labelOp(inst.c), labelOp(inst.d));
+        emitInstForGLoop(build, vmRegOp(inst.a), intOp(inst.b), labelOp(inst.c));
+        jumpOrFallthrough(blockOp(inst.d), next);
        break;
    case IrCmd::FORGLOOP_FALLBACK:
-        regs.assertAllFree();
-        emitinstForGLoopFallback(build, vmRegOp(inst.a), intOp(inst.b), labelOp(inst.c));
-        build.jmp(labelOp(inst.d));
+    {
+        IrCallWrapperX64 callWrap(regs, build, index);
+        callWrap.addArgument(SizeX64::qword, rState);
+        callWrap.addArgument(SizeX64::dword, vmRegOp(inst.a));
+        callWrap.addArgument(SizeX64::dword, intOp(inst.b));
+        callWrap.call(qword[rNativeContext + offsetof(NativeContext, forgLoopNonTableFallback)]);
+
+        emitUpdateBase(build);
+
+        build.test(al, al);
+        build.jcc(ConditionX64::NotZero, labelOp(inst.c));
+        jumpOrFallthrough(blockOp(inst.d), next);
        break;
+    }
    case IrCmd::FORGPREP_XNEXT_FALLBACK:
-        regs.assertAllFree();
-        emitInstForGPrepXnextFallback(build, uintOp(inst.a), vmRegOp(inst.b), labelOp(inst.c));
+    {
+        IrCallWrapperX64 callWrap(regs, build, index);
+        callWrap.addArgument(SizeX64::qword, rState);
+        callWrap.addArgument(SizeX64::qword, luauRegAddress(vmRegOp(inst.b)));
+        callWrap.addArgument(SizeX64::dword, uintOp(inst.a) + 1);
+        callWrap.call(qword[rNativeContext + offsetof(NativeContext, forgPrepXnextFallback)]);
+        jumpOrFallthrough(blockOp(inst.c), next);
        break;
+    }
    case IrCmd::COVERAGE:
-        regs.assertAllFree();
-        emitInstCoverage(build, uintOp(inst.a));
+    {
+        ScopedRegX64 tmp1{regs, SizeX64::qword};
+        ScopedRegX64 tmp2{regs, SizeX64::dword};
+        ScopedRegX64 tmp3{regs, SizeX64::dword};
+
+        build.mov(tmp1.reg, sCode);
+        build.add(tmp1.reg, uintOp(inst.a) * sizeof(Instruction));
+
+        // hits = LUAU_INSN_E(*pc)
+        build.mov(tmp2.reg, dword[tmp1.reg]);
+        build.sar(tmp2.reg, 8);
+
+        // hits = (hits < (1 << 23) - 1) ? hits + 1 : hits;
+        build.xor_(tmp3.reg, tmp3.reg);
+        build.cmp(tmp2.reg, (1 << 23) - 1);
+        build.setcc(ConditionX64::NotEqual, byteReg(tmp3.reg));
+        build.add(tmp2.reg, tmp3.reg);
+
+        // VM_PATCH_E(pc, hits);
+        build.sal(tmp2.reg, 8);
+        build.movzx(tmp3.reg, byte[tmp1.reg]);
+        build.or_(tmp3.reg, tmp2.reg);
+        build.mov(dword[tmp1.reg], tmp3.reg);
        break;
+    }

        // Full instruction fallbacks
    case IrCmd::FALLBACK_GETGLOBAL:
        LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.c.kind == IrOpKind::VmConst);

-        regs.assertAllFree();
-        emitFallback(build, data, LOP_GETGLOBAL, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_GETGLOBAL, uintOp(inst.a));
        break;
    case IrCmd::FALLBACK_SETGLOBAL:
        LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.c.kind == IrOpKind::VmConst);

-        regs.assertAllFree();
-        emitFallback(build, data, LOP_SETGLOBAL, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_SETGLOBAL, uintOp(inst.a));
        break;
    case IrCmd::FALLBACK_GETTABLEKS:
        LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.c.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.d.kind == IrOpKind::VmConst);

-        regs.assertAllFree();
-        emitFallback(build, data, LOP_GETTABLEKS, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_GETTABLEKS, uintOp(inst.a));
        break;
    case IrCmd::FALLBACK_SETTABLEKS:
        LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.c.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.d.kind == IrOpKind::VmConst);

-        regs.assertAllFree();
-        emitFallback(build, data, LOP_SETTABLEKS, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_SETTABLEKS, uintOp(inst.a));
        break;
    case IrCmd::FALLBACK_NAMECALL:
        LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.c.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.d.kind == IrOpKind::VmConst);

-        regs.assertAllFree();
-        emitFallback(build, data, LOP_NAMECALL, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_NAMECALL, uintOp(inst.a));
        break;
    case IrCmd::FALLBACK_PREPVARARGS:
        LUAU_ASSERT(inst.b.kind == IrOpKind::Constant);

-        regs.assertAllFree();
-        emitFallback(build, data, LOP_PREPVARARGS, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_PREPVARARGS, uintOp(inst.a));
        break;
    case IrCmd::FALLBACK_GETVARARGS:
        LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.c.kind == IrOpKind::Constant);

-        regs.assertAllFree();
-        emitFallback(build, data, LOP_GETVARARGS, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_GETVARARGS, uintOp(inst.a));
        break;
    case IrCmd::FALLBACK_NEWCLOSURE:
        LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.c.kind == IrOpKind::Constant);

-        regs.assertAllFree();
-        emitFallback(build, data, LOP_NEWCLOSURE, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_NEWCLOSURE, uintOp(inst.a));
        break;
    case IrCmd::FALLBACK_DUPCLOSURE:
        LUAU_ASSERT(inst.b.kind == IrOpKind::VmReg);
        LUAU_ASSERT(inst.c.kind == IrOpKind::VmConst);

-        regs.assertAllFree();
-        emitFallback(build, data, LOP_DUPCLOSURE, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_DUPCLOSURE, uintOp(inst.a));
        break;
    case IrCmd::FALLBACK_FORGPREP:
-        regs.assertAllFree();
-        emitFallback(build, data, LOP_FORGPREP, uintOp(inst.a));
+        emitFallback(regs, build, data, LOP_FORGPREP, uintOp(inst.a));
+        jumpOrFallthrough(blockOp(inst.c), next);
        break;
-    default:
-        LUAU_ASSERT(!"Not supported yet");
+
+    // Pseudo instructions
+    case IrCmd::NOP:
+    case IrCmd::SUBSTITUTE:
+        LUAU_ASSERT(!"Pseudo instructions should not be lowered");
        break;
    }

--- a/CodeGen/src/IrRegAllocA64.cpp
+++ b/CodeGen/src/IrRegAllocA64.cpp
@ -1,9 +1,7 @@
 // This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
 #include "IrRegAllocA64.h"

-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
+#include "BitUtils.h"

 namespace Luau
 {
@ -12,19 +10,6 @@ namespace CodeGen
 namespace A64
 {

-inline int setBit(uint32_t n)
-{
-    LUAU_ASSERT(n);
-
-#ifdef _MSC_VER
-    unsigned long rl;
-    _BitScanReverse(&rl, n);
-    return int(rl);
-#else
-    return 31 - __builtin_clz(n);
-#endif
-}
-
 IrRegAllocA64::IrRegAllocA64(IrFunction& function, std::initializer_list<std::pair<RegisterA64, RegisterA64>> regs)
    : function(function)
 {
@ -52,7 +37,7 @@ RegisterA64 IrRegAllocA64::allocReg(KindA64 kind)
        return noreg;
    }

-    int index = setBit(set.free);
+    int index = 31 - countlz(set.free);
    set.free &= ~(1u << index);

    return RegisterA64{kind, uint8_t(index)};
@ -68,7 +53,7 @@ RegisterA64 IrRegAllocA64::allocTemp(KindA64 kind)
        return noreg;
    }

-    int index = setBit(set.free);
+    int index = 31 - countlz(set.free);

    set.free &= ~(1u << index);
    set.temp |= 1u << index;
--- a/CodeGen/src/IrRegAllocX64.cpp
+++ b/CodeGen/src/IrRegAllocX64.cpp
@ -1,6 +1,8 @@
 // This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
 #include "Luau/IrRegAllocX64.h"

+#include "Luau/IrUtils.h"
+
 #include "EmitCommonX64.h"

 namespace Luau
@ -12,11 +14,6 @@ namespace X64

 static const RegisterX64 kGprAllocOrder[] = {rax, rdx, rcx, rbx, rsi, rdi, r8, r9, r10, r11};

-static bool isFullTvalueOperand(IrCmd cmd)
-{
-    return cmd == IrCmd::LOAD_TVALUE || cmd == IrCmd::LOAD_NODE_VALUE_TV;
-}
-
 IrRegAllocX64::IrRegAllocX64(AssemblyBuilderX64& build, IrFunction& function)
    : build(build)
    , function(function)
@ -27,50 +24,43 @@ IrRegAllocX64::IrRegAllocX64(AssemblyBuilderX64& build, IrFunction& function)
    xmmInstUsers.fill(kInvalidInstIdx);
 }

-RegisterX64 IrRegAllocX64::allocGprReg(SizeX64 preferredSize, uint32_t instIdx)
+RegisterX64 IrRegAllocX64::allocReg(SizeX64 size, uint32_t instIdx)
 {
-    LUAU_ASSERT(
-        preferredSize == SizeX64::byte || preferredSize == SizeX64::word || preferredSize == SizeX64::dword || preferredSize == SizeX64::qword);
-
-    for (RegisterX64 reg : kGprAllocOrder)
+    if (size == SizeX64::xmmword)
    {
-        if (freeGprMap[reg.index])
+        for (size_t i = 0; i < freeXmmMap.size(); ++i)
        {
-            freeGprMap[reg.index] = false;
-            gprInstUsers[reg.index] = instIdx;
-            return RegisterX64{preferredSize, reg.index};
+            if (freeXmmMap[i])
+            {
+                freeXmmMap[i] = false;
+                xmmInstUsers[i] = instIdx;
+                return RegisterX64{size, uint8_t(i)};
+            }
        }
    }
-
-    // If possible, spill the value with the furthest next use
-    if (uint32_t furthestUseTarget = findInstructionWithFurthestNextUse(gprInstUsers); furthestUseTarget != kInvalidInstIdx)
-        return takeReg(function.instructions[furthestUseTarget].regX64, instIdx);
-
-    LUAU_ASSERT(!"Out of GPR registers to allocate");
-    return noreg;
-}
-
-RegisterX64 IrRegAllocX64::allocXmmReg(uint32_t instIdx)
-{
-    for (size_t i = 0; i < freeXmmMap.size(); ++i)
+    else
    {
-        if (freeXmmMap[i])
+        for (RegisterX64 reg : kGprAllocOrder)
        {
-            freeXmmMap[i] = false;
-            xmmInstUsers[i] = instIdx;
-            return RegisterX64{SizeX64::xmmword, uint8_t(i)};
+            if (freeGprMap[reg.index])
+            {
+                freeGprMap[reg.index] = false;
+                gprInstUsers[reg.index] = instIdx;
+                return RegisterX64{size, reg.index};
+            }
        }
    }

    // Out of registers, spill the value with the furthest next use
-    if (uint32_t furthestUseTarget = findInstructionWithFurthestNextUse(xmmInstUsers); furthestUseTarget != kInvalidInstIdx)
+    const std::array<uint32_t, 16>& regInstUsers = size == SizeX64::xmmword ? xmmInstUsers : gprInstUsers;
+    if (uint32_t furthestUseTarget = findInstructionWithFurthestNextUse(regInstUsers); furthestUseTarget != kInvalidInstIdx)
        return takeReg(function.instructions[furthestUseTarget].regX64, instIdx);

-    LUAU_ASSERT(!"Out of XMM registers to allocate");
+    LUAU_ASSERT(!"Out of registers to allocate");
    return noreg;
 }

-RegisterX64 IrRegAllocX64::allocGprRegOrReuse(SizeX64 preferredSize, uint32_t instIdx, std::initializer_list<IrOp> oprefs)
+RegisterX64 IrRegAllocX64::allocRegOrReuse(SizeX64 size, uint32_t instIdx, std::initializer_list<IrOp> oprefs)
 {
    for (IrOp op : oprefs)
    {
@ -81,39 +71,24 @@ RegisterX64 IrRegAllocX64::allocGprRegOrReuse(SizeX64 preferredSize, uint32_t in

        if (source.lastUse == instIdx && !source.reusedReg && !source.spilled)
        {
-            LUAU_ASSERT(source.regX64.size != SizeX64::xmmword);
+            // Not comparing size directly because we only need matching register set
+            if ((size == SizeX64::xmmword) != (source.regX64.size == SizeX64::xmmword))
+                continue;
+
            LUAU_ASSERT(source.regX64 != noreg);

            source.reusedReg = true;
-            gprInstUsers[source.regX64.index] = instIdx;
-            return RegisterX64{preferredSize, source.regX64.index};
+
+            if (size == SizeX64::xmmword)
+                xmmInstUsers[source.regX64.index] = instIdx;
+            else
+                gprInstUsers[source.regX64.index] = instIdx;
+
+            return RegisterX64{size, source.regX64.index};
        }
    }

-    return allocGprReg(preferredSize, instIdx);
-}
-
-RegisterX64 IrRegAllocX64::allocXmmRegOrReuse(uint32_t instIdx, std::initializer_list<IrOp> oprefs)
-{
-    for (IrOp op : oprefs)
-    {
-        if (op.kind != IrOpKind::Inst)
-            continue;
-
-        IrInst& source = function.instructions[op.index];
-
-        if (source.lastUse == instIdx && !source.reusedReg && !source.spilled)
-        {
-            LUAU_ASSERT(source.regX64.size == SizeX64::xmmword);
-            LUAU_ASSERT(source.regX64 != noreg);
-
-            source.reusedReg = true;
-            xmmInstUsers[source.regX64.index] = instIdx;
-            return source.regX64;
-        }
-    }
-
-    return allocXmmReg(instIdx);
+    return allocReg(size, instIdx);
 }

 RegisterX64 IrRegAllocX64::takeReg(RegisterX64 reg, uint32_t instIdx)
@ -197,41 +172,34 @@ bool IrRegAllocX64::isLastUseReg(const IrInst& target, uint32_t instIdx) const

 void IrRegAllocX64::preserve(IrInst& inst)
 {
-    bool doubleSlot = isFullTvalueOperand(inst.cmd);
+    IrSpillX64 spill;
+    spill.instIdx = function.getInstIndex(inst);
+    spill.valueKind = getCmdValueKind(inst.cmd);
+    spill.spillId = nextSpillId++;
+    spill.originalLoc = inst.regX64;

-    // Find a free stack slot. Two consecutive slots might be required for 16 byte TValues, so '- 1' is used
-    for (unsigned i = 0; i < unsigned(usedSpillSlots.size() - 1); ++i)
+    // Loads from VmReg/VmConst don't have to be spilled to the stack; they can be reloaded from their VM register or constant later
+    if (!hasRestoreOp(inst))
    {
-        if (usedSpillSlots.test(i))
-            continue;
+        unsigned i = findSpillStackSlot(spill.valueKind);

-        if (doubleSlot && usedSpillSlots.test(i + 1))
-        {
-            ++i; // No need to retest this double position
-            continue;
-        }
-
-        if (inst.regX64.size == SizeX64::xmmword && doubleSlot)
-        {
+        if (spill.valueKind == IrValueKind::Tvalue)
            build.vmovups(xmmword[sSpillArea + i * 8], inst.regX64);
-        }
-        else if (inst.regX64.size == SizeX64::xmmword)
-        {
+        else if (spill.valueKind == IrValueKind::Double)
            build.vmovsd(qword[sSpillArea + i * 8], inst.regX64);
-        }
+        else if (spill.valueKind == IrValueKind::Pointer)
+            build.mov(qword[sSpillArea + i * 8], inst.regX64);
+        else if (spill.valueKind == IrValueKind::Tag || spill.valueKind == IrValueKind::Int)
+            build.mov(dword[sSpillArea + i * 8], inst.regX64);
        else
-        {
-            OperandX64 location = addr[sSpillArea + i * 8];
-            location.memSize = inst.regX64.size; // Override memory access size
-            build.mov(location, inst.regX64);
-        }
+            LUAU_ASSERT(!"unsupported value kind");

        usedSpillSlots.set(i);

        if (i + 1 > maxUsedSlot)
            maxUsedSlot = i + 1;

-        if (doubleSlot)
+        if (spill.valueKind == IrValueKind::Tvalue)
        {
            usedSpillSlots.set(i + 1);

@ -239,22 +207,15 @@ void IrRegAllocX64::preserve(IrInst& inst)
                maxUsedSlot = i + 2;
        }

-        IrSpillX64 spill;
-        spill.instIdx = function.getInstIndex(inst);
-        spill.useDoubleSlot = doubleSlot;
        spill.stackSlot = uint8_t(i);
-        spill.originalLoc = inst.regX64;
-
-        spills.push_back(spill);
-
-        freeReg(inst.regX64);
-
-        inst.regX64 = noreg;
-        inst.spilled = true;
-        return;
    }

-    LUAU_ASSERT(!"nowhere to spill");
+    spills.push_back(spill);
+
+    freeReg(inst.regX64);
+
+    inst.regX64 = noreg;
+    inst.spilled = true;
 }

 void IrRegAllocX64::restore(IrInst& inst, bool intoOriginalLocation)
@ -267,35 +228,34 @@ void IrRegAllocX64::restore(IrInst& inst, bool intoOriginalLocation)

        if (spill.instIdx == instIdx)
        {
-            LUAU_ASSERT(spill.stackSlot != kNoStackSlot);
-            RegisterX64 reg;
+            RegisterX64 reg = intoOriginalLocation ? takeReg(spill.originalLoc, instIdx) : allocReg(spill.originalLoc.size, instIdx);
+            OperandX64 restoreLocation = noreg;

-            if (spill.originalLoc.size == SizeX64::xmmword)
+            if (spill.stackSlot != kNoStackSlot)
            {
-                reg = intoOriginalLocation ? takeReg(spill.originalLoc, instIdx) : allocXmmReg(instIdx);
+                restoreLocation = addr[sSpillArea + spill.stackSlot * 8];
+                restoreLocation.memSize = reg.size;

-                if (spill.useDoubleSlot)
-                    build.vmovups(reg, xmmword[sSpillArea + spill.stackSlot * 8]);
-                else
-                    build.vmovsd(reg, qword[sSpillArea + spill.stackSlot * 8]);
+                usedSpillSlots.set(spill.stackSlot, false);
+
+                if (spill.valueKind == IrValueKind::Tvalue)
+                    usedSpillSlots.set(spill.stackSlot + 1, false);
            }
            else
            {
-                reg = intoOriginalLocation ? takeReg(spill.originalLoc, instIdx) : allocGprReg(spill.originalLoc.size, instIdx);
-
-                OperandX64 location = addr[sSpillArea + spill.stackSlot * 8];
-                location.memSize = reg.size; // Override memory access size
-                build.mov(reg, location);
+                restoreLocation = getRestoreAddress(inst, getRestoreOp(inst));
            }

+            if (spill.valueKind == IrValueKind::Tvalue)
+                build.vmovups(reg, restoreLocation);
+            else if (spill.valueKind == IrValueKind::Double)
+                build.vmovsd(reg, restoreLocation);
+            else
+                build.mov(reg, restoreLocation);
+
            inst.regX64 = reg;
            inst.spilled = false;

-            usedSpillSlots.set(spill.stackSlot, false);
-
-            if (spill.useDoubleSlot)
-                usedSpillSlots.set(spill.stackSlot + 1, false);
-
            spills[i] = spills.back();
            spills.pop_back();
            return;
@ -334,6 +294,81 @@ bool IrRegAllocX64::shouldFreeGpr(RegisterX64 reg) const
    return false;
 }

+unsigned IrRegAllocX64::findSpillStackSlot(IrValueKind valueKind)
+{
+    // Find a free stack slot. Two consecutive slots might be required for 16 byte TValues, so '- 1' is used
+    for (unsigned i = 0; i < unsigned(usedSpillSlots.size() - 1); ++i)
+    {
+        if (usedSpillSlots.test(i))
+            continue;
+
+        if (valueKind == IrValueKind::Tvalue && usedSpillSlots.test(i + 1))
+        {
+            ++i; // No need to retest this double position
+            continue;
+        }
+
+        return i;
+    }
+
+    LUAU_ASSERT(!"nowhere to spill");
+    return ~0u;
+}
+
+IrOp IrRegAllocX64::getRestoreOp(const IrInst& inst) const
+{
+    switch (inst.cmd)
+    {
+    case IrCmd::LOAD_TAG:
+    case IrCmd::LOAD_POINTER:
+    case IrCmd::LOAD_DOUBLE:
+    case IrCmd::LOAD_INT:
+    case IrCmd::LOAD_TVALUE:
+    {
+        IrOp location = inst.a;
+
+        // Might have an alternative location
+        if (IrOp alternative = function.findRestoreOp(inst); alternative.kind != IrOpKind::None)
+            location = alternative;
+
+        if (location.kind == IrOpKind::VmReg || location.kind == IrOpKind::VmConst)
+            return location;
+
+        break;
+    }
+    default:
+        break;
+    }
+
+    return IrOp();
+}
+
+bool IrRegAllocX64::hasRestoreOp(const IrInst& inst) const
+{
+    return getRestoreOp(inst).kind != IrOpKind::None;
+}
+
+OperandX64 IrRegAllocX64::getRestoreAddress(const IrInst& inst, IrOp restoreOp)
+{
+    switch (inst.cmd)
+    {
+    case IrCmd::LOAD_TAG:
+        return restoreOp.kind == IrOpKind::VmReg ? luauRegTag(vmRegOp(restoreOp)) : luauConstantTag(vmConstOp(restoreOp));
+    case IrCmd::LOAD_POINTER:
+    case IrCmd::LOAD_DOUBLE:
+        return restoreOp.kind == IrOpKind::VmReg ? luauRegValue(vmRegOp(restoreOp)) : luauConstantValue(vmConstOp(restoreOp));
+    case IrCmd::LOAD_INT:
+        LUAU_ASSERT(restoreOp.kind == IrOpKind::VmReg);
+        return luauRegValueInt(vmRegOp(restoreOp));
+    case IrCmd::LOAD_TVALUE:
+        return restoreOp.kind == IrOpKind::VmReg ? luauReg(vmRegOp(restoreOp)) : luauConstant(vmConstOp(restoreOp));
+    default:
+        break;
+    }
+
+    return noreg;
+}
+
 uint32_t IrRegAllocX64::findInstructionWithFurthestNextUse(const std::array<uint32_t, 16>& regInstUsers) const
 {
    uint32_t furthestUseTarget = kInvalidInstIdx;
@ -411,11 +446,7 @@ ScopedRegX64::~ScopedRegX64()
 void ScopedRegX64::alloc(SizeX64 size)
 {
    LUAU_ASSERT(reg == noreg);
-
-    if (size == SizeX64::xmmword)
-        reg = owner.allocXmmReg(kInvalidInstIdx);
-    else
-        reg = owner.allocGprReg(size, kInvalidInstIdx);
+    reg = owner.allocReg(size, kInvalidInstIdx);
 }

 void ScopedRegX64::free()
@ -435,38 +466,36 @@ RegisterX64 ScopedRegX64::release()
 ScopedSpills::ScopedSpills(IrRegAllocX64& owner)
    : owner(owner)
 {
-    snapshot = owner.spills;
+    startSpillId = owner.nextSpillId;
 }

 ScopedSpills::~ScopedSpills()
 {
-    // Taking a copy of current spills because we are going to potentially restore them
-    std::vector<IrSpillX64> current = owner.spills;
+    unsigned endSpillId = owner.nextSpillId;

-    // Restore registers that were spilled inside scope protected by this object
-    for (IrSpillX64& curr : current)
+    for (size_t i = 0; i < owner.spills.size();)
    {
-        // If spill existed before current scope, it can be restored outside of it
-        if (!wasSpilledBefore(curr))
+        IrSpillX64& spill = owner.spills[i];
+
+        // Restoring spills inside this scope cannot create new spills
+        LUAU_ASSERT(spill.spillId < endSpillId);
+
+        // If spill was created inside current scope, it has to be restored
+        if (spill.spillId >= startSpillId)
        {
-            IrInst& inst = owner.function.instructions[curr.instIdx];
+            IrInst& inst = owner.function.instructions[spill.instIdx];

            owner.restore(inst, /*intoOriginalLocation*/ true);
+
+            // Spill restore removes the spill entry, so loop is repeated at the same 'i'
+        }
+        else
+        {
+            i++;
        }
    }
 }

-bool ScopedSpills::wasSpilledBefore(const IrSpillX64& spill) const
-{
-    for (const IrSpillX64& preexisting : snapshot)
-    {
-        if (spill.instIdx == preexisting.instIdx)
-            return true;
-    }
-
-    return false;
-}
-
 } // namespace X64
 } // namespace CodeGen
 } // namespace Luau
--- a/CodeGen/src/IrTranslateBuiltins.cpp
+++ b/CodeGen/src/IrTranslateBuiltins.cpp
@ -8,6 +8,8 @@

 // TODO: when nresults is less than our actual result count, we can skip computing/writing unused results

+static const int kMinMaxUnrolledParams = 5;
+
 namespace Luau
 {
 namespace CodeGen
@ -23,7 +25,7 @@ BuiltinImplResult translateBuiltinNumberToNumber(
        return {BuiltinImplType::None, -1};

    build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
-    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(1), build.constInt(1));

    if (ra != arg)
        build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TNUMBER));
@ -40,7 +42,7 @@ BuiltinImplResult translateBuiltin2NumberToNumber(

    build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
    build.loadAndCheckTag(args, LUA_TNUMBER, fallback);
-    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(2), build.constInt(1));

    if (ra != arg)
        build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TNUMBER));
@ -56,12 +58,13 @@ BuiltinImplResult translateBuiltinNumberTo2Number(
        return {BuiltinImplType::None, -1};

    build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
-    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+    build.inst(
+        IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(1), build.constInt(nresults == 1 ? 1 : 2));

    if (ra != arg)
        build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TNUMBER));

-    if (nresults > 1)
+    if (nresults != 1)
        build.inst(IrCmd::STORE_TAG, build.vmReg(ra + 1), build.constTag(LUA_TNUMBER));

    return {BuiltinImplType::UsesFallback, 2};
@ -125,12 +128,33 @@ BuiltinImplResult translateBuiltinMathLog(
    if (nparams < 1 || nresults > 1)
        return {BuiltinImplType::None, -1};

-    build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
+    LuauBuiltinFunction fcId = bfid;
+    int fcParams = 1;

    if (nparams != 1)
-        build.loadAndCheckTag(args, LUA_TNUMBER, fallback);
+    {
+        if (args.kind != IrOpKind::VmConst)
+            return {BuiltinImplType::None, -1};

-    build.inst(IrCmd::FASTCALL, build.constUint(bfid), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+        LUAU_ASSERT(build.function.proto);
+        TValue protok = build.function.proto->k[vmConstOp(args)];
+
+        if (protok.tt != LUA_TNUMBER)
+            return {BuiltinImplType::None, -1};
+
+        // TODO: IR builtin lowering assumes that the only valid 2-argument call is log2; ideally, we use a less hacky way to indicate that
+        if (protok.value.n == 2.0)
+            fcParams = 2;
+        else if (protok.value.n == 10.0)
+            fcId = LBF_MATH_LOG10;
+        else
+            // TODO: We can precompute log(args) and divide by it, but that requires extra LOAD/STORE so for now just fall back as this is rare
+            return {BuiltinImplType::None, -1};
+    }
+
+    build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
+
+    build.inst(IrCmd::FASTCALL, build.constUint(fcId), build.vmReg(ra), build.vmReg(arg), args, build.constInt(fcParams), build.constInt(1));

    if (ra != arg)
        build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TNUMBER));
@ -140,17 +164,26 @@ BuiltinImplResult translateBuiltinMathLog(

 BuiltinImplResult translateBuiltinMathMin(IrBuilder& build, int nparams, int ra, int arg, IrOp args, int nresults, IrOp fallback)
 {
-    // TODO: this can be extended for other number of arguments
-    if (nparams != 2 || nresults > 1)
+    if (nparams < 2 || nparams > kMinMaxUnrolledParams || nresults > 1)
        return {BuiltinImplType::None, -1};

    build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
    build.loadAndCheckTag(args, LUA_TNUMBER, fallback);

+    for (int i = 3; i <= nparams; ++i)
+        build.loadAndCheckTag(build.vmReg(vmRegOp(args) + (i - 2)), LUA_TNUMBER, fallback);
+
    IrOp varg1 = build.inst(IrCmd::LOAD_DOUBLE, build.vmReg(arg));
    IrOp varg2 = build.inst(IrCmd::LOAD_DOUBLE, args);

    IrOp res = build.inst(IrCmd::MIN_NUM, varg2, varg1); // Swapped arguments are required for consistency with VM builtins
+
+    for (int i = 3; i <= nparams; ++i)
+    {
+        IrOp arg = build.inst(IrCmd::LOAD_DOUBLE, build.vmReg(vmRegOp(args) + (i - 2)));
+        res = build.inst(IrCmd::MIN_NUM, arg, res);
+    }
+
    build.inst(IrCmd::STORE_DOUBLE, build.vmReg(ra), res);

    if (ra != arg)
@ -161,17 +194,26 @@ BuiltinImplResult translateBuiltinMathMin(IrBuilder& build, int nparams, int ra,

 BuiltinImplResult translateBuiltinMathMax(IrBuilder& build, int nparams, int ra, int arg, IrOp args, int nresults, IrOp fallback)
 {
-    // TODO: this can be extended for other number of arguments
-    if (nparams != 2 || nresults > 1)
+    if (nparams < 2 || nparams > kMinMaxUnrolledParams || nresults > 1)
        return {BuiltinImplType::None, -1};

    build.loadAndCheckTag(build.vmReg(arg), LUA_TNUMBER, fallback);
    build.loadAndCheckTag(args, LUA_TNUMBER, fallback);

+    for (int i = 3; i <= nparams; ++i)
+        build.loadAndCheckTag(build.vmReg(vmRegOp(args) + (i - 2)), LUA_TNUMBER, fallback);
+
    IrOp varg1 = build.inst(IrCmd::LOAD_DOUBLE, build.vmReg(arg));
    IrOp varg2 = build.inst(IrCmd::LOAD_DOUBLE, args);

    IrOp res = build.inst(IrCmd::MAX_NUM, varg2, varg1); // Swapped arguments are required for consistency with VM builtins
+
+    for (int i = 3; i <= nparams; ++i)
+    {
+        IrOp arg = build.inst(IrCmd::LOAD_DOUBLE, build.vmReg(vmRegOp(args) + (i - 2)));
+        res = build.inst(IrCmd::MAX_NUM, arg, res);
+    }
+
    build.inst(IrCmd::STORE_DOUBLE, build.vmReg(ra), res);

    if (ra != arg)
@ -254,8 +296,7 @@ BuiltinImplResult translateBuiltinType(IrBuilder& build, int nparams, int ra, in
    if (nparams < 1 || nresults > 1)
        return {BuiltinImplType::None, -1};

-    build.inst(
-        IrCmd::FASTCALL, build.constUint(LBF_TYPE), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+    build.inst(IrCmd::FASTCALL, build.constUint(LBF_TYPE), build.vmReg(ra), build.vmReg(arg), args, build.constInt(1), build.constInt(1));

    build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TSTRING));

@ -267,8 +308,7 @@ BuiltinImplResult translateBuiltinTypeof(IrBuilder& build, int nparams, int ra,
    if (nparams < 1 || nresults > 1)
        return {BuiltinImplType::None, -1};

-    build.inst(
-        IrCmd::FASTCALL, build.constUint(LBF_TYPEOF), build.vmReg(ra), build.vmReg(arg), args, build.constInt(nparams), build.constInt(nresults));
+    build.inst(IrCmd::FASTCALL, build.constUint(LBF_TYPEOF), build.vmReg(ra), build.vmReg(arg), args, build.constInt(1), build.constInt(1));

    build.inst(IrCmd::STORE_TAG, build.vmReg(ra), build.constTag(LUA_TSTRING));

--- a/CodeGen/src/IrUtils.cpp
+++ b/CodeGen/src/IrUtils.cpp
@ -284,7 +284,7 @@ void replace(IrFunction& function, IrBlock& block, uint32_t instIdx, IrInst repl
    block.useCount--;
 }

-void substitute(IrFunction& function, IrInst& inst, IrOp replacement)
+void substitute(IrFunction& function, IrInst& inst, IrOp replacement, IrOp location)
 {
    LUAU_ASSERT(!isBlockTerminator(inst.cmd));

@ -298,7 +298,7 @@ void substitute(IrFunction& function, IrInst& inst, IrOp replacement)
    removeUse(function, inst.f);

    inst.a = replacement;
-    inst.b = {};
+    inst.b = location;
    inst.c = {};
    inst.d = {};
    inst.e = {};
--- a/CodeGen/src/NativeState.cpp
+++ b/CodeGen/src/NativeState.cpp
@ -16,7 +16,7 @@
 #include <math.h>
 #include <string.h>

-#define CODEGEN_SET_FALLBACK(op, flags) data.context.fallback[op] = {execute_##op, flags}
+#define CODEGEN_SET_FALLBACK(op) data.context.fallback[op] = {execute_##op}

 namespace Luau
 {
@ -36,20 +36,21 @@ NativeState::~NativeState() = default;
 void initFallbackTable(NativeState& data)
 {
    // When fallback is completely removed, remove it from includeInsts list in lvmexecute_split.py
-    CODEGEN_SET_FALLBACK(LOP_NEWCLOSURE, 0);
-    CODEGEN_SET_FALLBACK(LOP_NAMECALL, 0);
-    CODEGEN_SET_FALLBACK(LOP_FORGPREP, kFallbackUpdatePc);
-    CODEGEN_SET_FALLBACK(LOP_GETVARARGS, 0);
-    CODEGEN_SET_FALLBACK(LOP_DUPCLOSURE, 0);
-    CODEGEN_SET_FALLBACK(LOP_PREPVARARGS, 0);
-    CODEGEN_SET_FALLBACK(LOP_BREAK, 0);
+    CODEGEN_SET_FALLBACK(LOP_NEWCLOSURE);
+    CODEGEN_SET_FALLBACK(LOP_NAMECALL);
+    CODEGEN_SET_FALLBACK(LOP_FORGPREP);
+    CODEGEN_SET_FALLBACK(LOP_GETVARARGS);
+    CODEGEN_SET_FALLBACK(LOP_DUPCLOSURE);
+    CODEGEN_SET_FALLBACK(LOP_PREPVARARGS);
+    CODEGEN_SET_FALLBACK(LOP_BREAK);
+    CODEGEN_SET_FALLBACK(LOP_SETLIST);

    // Fallbacks that are called from partial implementation of an instruction
    // TODO: these fallbacks should be replaced with special functions that exclude the (redundantly executed) fast path from the fallback
-    CODEGEN_SET_FALLBACK(LOP_GETGLOBAL, 0);
-    CODEGEN_SET_FALLBACK(LOP_SETGLOBAL, 0);
-    CODEGEN_SET_FALLBACK(LOP_GETTABLEKS, 0);
-    CODEGEN_SET_FALLBACK(LOP_SETTABLEKS, 0);
+    CODEGEN_SET_FALLBACK(LOP_GETGLOBAL);
+    CODEGEN_SET_FALLBACK(LOP_SETGLOBAL);
+    CODEGEN_SET_FALLBACK(LOP_GETTABLEKS);
+    CODEGEN_SET_FALLBACK(LOP_SETTABLEKS);
 }

 void initHelperFunctions(NativeState& data)
@ -105,6 +106,7 @@ void initHelperFunctions(NativeState& data)
    data.context.libm_tan = tan;
    data.context.libm_tanh = tanh;

+    data.context.forgLoopTableIter = forgLoopTableIter;
    data.context.forgLoopNodeIter = forgLoopNodeIter;
    data.context.forgLoopNonTableFallback = forgLoopNonTableFallback;
    data.context.forgPrepXnextFallback = forgPrepXnextFallback;
--- a/CodeGen/src/NativeState.h
+++ b/CodeGen/src/NativeState.h
@ -23,15 +23,7 @@ namespace CodeGen

 class UnwindBuilder;

-using FallbackFn = const Instruction*(lua_State* L, const Instruction* pc, StkId base, TValue* k);
-
-constexpr uint8_t kFallbackUpdatePc = 1 << 0;
-
-struct NativeFallback
-{
-    FallbackFn* fallback;
-    uint8_t flags;
-};
+using FallbackFn = const Instruction* (*)(lua_State* L, const Instruction* pc, StkId base, TValue* k);

 struct NativeProto
 {
@ -96,6 +88,7 @@ struct NativeContext
    double (*libm_modf)(double, double*) = nullptr;

    // Helper functions
+    bool (*forgLoopTableIter)(lua_State* L, Table* h, int index, TValue* ra) = nullptr;
    bool (*forgLoopNodeIter)(lua_State* L, Table* h, int index, TValue* ra) = nullptr;
    bool (*forgLoopNonTableFallback)(lua_State* L, int insnA, int aux) = nullptr;
    void (*forgPrepXnextFallback)(lua_State* L, TValue* ra, int pc) = nullptr;
@ -106,7 +99,7 @@ struct NativeContext
    Closure* (*returnFallback)(lua_State* L, StkId ra, int n) = nullptr;

    // Opcode fallbacks, implemented in C
-    NativeFallback fallback[LOP__COUNT] = {};
+    FallbackFn fallback[LOP__COUNT] = {};

    // Fast call methods, implemented in C
    luau_FastFunction luauF_table[256] = {};
--- a/CodeGen/src/OptimizeConstProp.cpp
+++ b/CodeGen/src/OptimizeConstProp.cpp
@ -502,6 +502,8 @@ static void constPropInInst(ConstPropState& state, IrBuilder& build, IrFunction&
            }
        }
        break;
+
+        // TODO: FASTCALL is more restrictive than INVOKE_FASTCALL; we should either determine the exact semantics, or rework it
    case IrCmd::FASTCALL:
    case IrCmd::INVOKE_FASTCALL:
        handleBuiltinEffects(state, LuauBuiltinFunction(function.uintOp(inst.a)), vmRegOp(inst.b), function.intOp(inst.f));
--- a/CodeGen/src/UnwindBuilderDwarf2.cpp
+++ b/CodeGen/src/UnwindBuilderDwarf2.cpp
@ -132,7 +132,7 @@ size_t UnwindBuilderDwarf2::getBeginOffset() const
    return beginOffset;
 }

-void UnwindBuilderDwarf2::start()
+void UnwindBuilderDwarf2::startInfo()
 {
    uint8_t* cieLength = pos;
    pos = writeu32(pos, 0); // Length (to be filled later)
@ -149,13 +149,23 @@ void UnwindBuilderDwarf2::start()
    // Optional CIE augmentation section (not present)

    // Call frame instructions (common for all FDEs, of which we have 1)
-    stackOffset = 8; // Return address was pushed by calling the function
-
-    pos = defineCfaExpression(pos, DW_REG_RSP, stackOffset); // Define CFA to be the rsp + 8
+    pos = defineCfaExpression(pos, DW_REG_RSP, 8);           // Define CFA to be the rsp + 8
    pos = defineSavedRegisterLocation(pos, DW_REG_RA, 8);    // Define return address register (RA) to be located at CFA - 8

    pos = alignPosition(cieLength, pos);
    writeu32(cieLength, unsigned(pos - cieLength - 4)); // Length field itself is excluded from length
+}
+
+void UnwindBuilderDwarf2::startFunction()
+{
+    // Begin and end offsets are filled in later by finishFunction and everything gets adjusted at the end
+    UnwindFunctionDwarf2 func;
+    func.beginOffset = 0;
+    func.endOffset = 0;
+    func.fdeEntryStartPos = uint32_t(pos - rawData);
+    unwindFunctions.push_back(func);
+
+    stackOffset = 8; // Return address was pushed by calling the function

    fdeEntryStart = pos;                          // Will be written at the end
    pos = writeu32(pos, 0);                       // Length (to be filled later)
@ -198,14 +208,20 @@ void UnwindBuilderDwarf2::setupFrameReg(X64::RegisterX64 reg, int espOffset)
    // Cfa is based on rsp, so no additonal commands are required
 }

-void UnwindBuilderDwarf2::finish()
+void UnwindBuilderDwarf2::finishFunction(uint32_t beginOffset, uint32_t endOffset)
 {
+    unwindFunctions.back().beginOffset = beginOffset;
+    unwindFunctions.back().endOffset = endOffset;
+
    LUAU_ASSERT(stackOffset % 16 == 0 && "stack has to be aligned to 16 bytes after prologue");
    LUAU_ASSERT(fdeEntryStart != nullptr);

    pos = alignPosition(fdeEntryStart, pos);
    writeu32(fdeEntryStart, unsigned(pos - fdeEntryStart - 4)); // Length field itself is excluded from length
+}

+void UnwindBuilderDwarf2::finishInfo()
+{
    // Terminate section
    pos = writeu32(pos, 0);

@ -217,15 +233,26 @@ size_t UnwindBuilderDwarf2::getSize() const
    return size_t(pos - rawData);
 }

-void UnwindBuilderDwarf2::finalize(char* target, void* funcAddress, size_t funcSize) const
+size_t UnwindBuilderDwarf2::getFunctionCount() const
+{
+    return unwindFunctions.size();
+}
+
+void UnwindBuilderDwarf2::finalize(char* target, size_t offset, void* funcAddress, size_t funcSize) const
 {
    memcpy(target, rawData, getSize());

-    LUAU_ASSERT(fdeEntryStart != nullptr);
-    unsigned fdeEntryStartPos = unsigned(fdeEntryStart - rawData);
+    for (const UnwindFunctionDwarf2& func : unwindFunctions)
+    {
+        uint8_t* fdeEntryStart = (uint8_t*)target + func.fdeEntryStartPos;

-    writeu64((uint8_t*)target + fdeEntryStartPos + kFdeInitialLocationOffset, uintptr_t(funcAddress));
-    writeu64((uint8_t*)target + fdeEntryStartPos + kFdeAddressRangeOffset, funcSize);
+        writeu64(fdeEntryStart + kFdeInitialLocationOffset, uintptr_t(funcAddress) + offset + func.beginOffset);
+
+        if (func.endOffset == kFullBlockFuncton)
+            writeu64(fdeEntryStart + kFdeAddressRangeOffset, funcSize - offset);
+        else
+            writeu64(fdeEntryStart + kFdeAddressRangeOffset, func.endOffset - func.beginOffset);
+    }
 }

 } // namespace CodeGen
--- a/CodeGen/src/UnwindBuilderWin.cpp
+++ b/CodeGen/src/UnwindBuilderWin.cpp
@ -21,17 +21,6 @@ namespace Luau
 namespace CodeGen
 {

-// This struct matches the layout of UNWIND_INFO from ehdata.h
-struct UnwindInfoWin
-{
-    uint8_t version : 3;
-    uint8_t flags : 5;
-    uint8_t prologsize;
-    uint8_t unwindcodecount;
-    uint8_t framereg : 4;
-    uint8_t frameregoff : 4;
-};
-
 void UnwindBuilderWin::setBeginOffset(size_t beginOffset)
 {
    this->beginOffset = beginOffset;
@ -42,11 +31,28 @@ size_t UnwindBuilderWin::getBeginOffset() const
    return beginOffset;
 }

-void UnwindBuilderWin::start()
-{
-    stackOffset = 8; // Return address was pushed by calling the function
+// No-op: this builder does nothing before the first function is started
+void UnwindBuilderWin::startInfo() {}

+// Begins a new function record: appends an UnwindFunctionWin whose unwindInfoOffset is the current write
+// position inside rawData, then resets the per-function prologue state (unwind codes, prolog size,
+// frame register and stack offset).
+void UnwindBuilderWin::startFunction()
+{
+    // End offset is filled in later and everything gets adjusted at the end
+    UnwindFunctionWin func;
+    func.beginOffset = 0;
+    func.endOffset = 0;
+    func.unwindInfoOffset = uint32_t(rawDataPos - rawData);
+    unwindFunctions.push_back(func);
+
+    // Unwind codes are collected per function, so codes of the previous function are dropped here
+    unwindCodes.clear();
    unwindCodes.reserve(16);
+
+    prologSize = 0;
+
+    // rax has register index 0, which in Windows unwind info means that frame register is not used
+    frameReg = X64::rax;
+    frameRegOffset = 0;
+
+    // Return address was pushed by calling the function
+    stackOffset = 8;
 }

 void UnwindBuilderWin::spill(int espOffset, X64::RegisterX64 reg)
@ -85,49 +91,89 @@ void UnwindBuilderWin::setupFrameReg(X64::RegisterX64 reg, int espOffset)
    unwindCodes.push_back({prologSize, UWOP_SET_FPREG, frameRegOffset});
 }

-void UnwindBuilderWin::finish()
+// Completes the last record in unwindFunctions: stores its code range, then serializes an UnwindInfoWin header
+// followed by the collected unwind codes into rawData, advancing rawDataPos past the written bytes.
+void UnwindBuilderWin::finishFunction(uint32_t beginOffset, uint32_t endOffset)
 {
+    unwindFunctions.back().beginOffset = beginOffset;
+    unwindFunctions.back().endOffset = endOffset;
+
    // Windows unwind code count is stored in uint8_t, so we can't have more
    LUAU_ASSERT(unwindCodes.size() < 256);

    LUAU_ASSERT(stackOffset % 16 == 0 && "stack has to be aligned to 16 bytes after prologue");

-    size_t codeArraySize = unwindCodes.size();
-    codeArraySize = (codeArraySize + 1) & ~1; // Size has to be even, but unwind code count doesn't have to
-
-    infoSize = sizeof(UnwindInfoWin) + sizeof(UnwindCodeWin) * codeArraySize;
-}
-
-size_t UnwindBuilderWin::getSize() const
-{
-    return infoSize;
-}
-
-void UnwindBuilderWin::finalize(char* target, void* funcAddress, size_t funcSize) const
-{
    UnwindInfoWin info;
    info.version = 1;
    info.flags = 0; // No EH
    info.prologsize = prologSize;
    info.unwindcodecount = uint8_t(unwindCodes.size());
+
+    // framereg and frameregoff are assigned only after checking that the values are below 16
+    LUAU_ASSERT(frameReg.index < 16);
    info.framereg = frameReg.index;
+
+    LUAU_ASSERT(frameRegOffset < 16);
    info.frameregoff = frameRegOffset;

-    memcpy(target, &info, sizeof(info));
-    target += sizeof(UnwindInfoWin);
+    LUAU_ASSERT(rawDataPos + sizeof(info) <= rawData + kRawDataLimit);
+    memcpy(rawDataPos, &info, sizeof(info));
+    rawDataPos += sizeof(info);

    if (!unwindCodes.empty())
    {
        // Copy unwind codes in reverse order
        // Some unwind codes take up two array slots, but we don't use those atm
-        char* pos = target + sizeof(UnwindCodeWin) * (unwindCodes.size() - 1);
+        // Writing starts at the slot of the last code and moves towards rawDataPos
+        uint8_t* unwindCodePos = rawDataPos + sizeof(UnwindCodeWin) * (unwindCodes.size() - 1);
+        // NOTE(review): this bounds the start of the last slot, not its end; the end of the written range
+        // is only covered by the assert at the bottom of this function, which runs after the copies
+        LUAU_ASSERT(unwindCodePos <= rawData + kRawDataLimit);

        for (size_t i = 0; i < unwindCodes.size(); i++)
        {
-            memcpy(pos, &unwindCodes[i], sizeof(UnwindCodeWin));
-            pos -= sizeof(UnwindCodeWin);
+            memcpy(unwindCodePos, &unwindCodes[i], sizeof(UnwindCodeWin));
+            unwindCodePos -= sizeof(UnwindCodeWin);
        }
    }
+
+    rawDataPos += sizeof(UnwindCodeWin) * unwindCodes.size();
+
+    // Size has to be even, but unwind code count doesn't have to
+    if (unwindCodes.size() % 2 != 0)
+        rawDataPos += sizeof(UnwindCodeWin);
+
+    LUAU_ASSERT(rawDataPos <= rawData + kRawDataLimit);
+}
+
+// No-op: this builder does nothing after the last function is finished
+void UnwindBuilderWin::finishInfo() {}
+
+// Returns the combined size in bytes of one UnwindFunctionWin per recorded function
+// plus the data written to rawData so far.
+size_t UnwindBuilderWin::getSize() const
+{
+    return sizeof(UnwindFunctionWin) * unwindFunctions.size() + size_t(rawDataPos - rawData);
+}
+
+// Returns the number of function records currently held in unwindFunctions
+size_t UnwindBuilderWin::getFunctionCount() const
+{
+    return unwindFunctions.size();
+}
+
+// Writes the final unwind data into 'target': one adjusted UnwindFunctionWin per recorded function first,
+// then the raw unwind info accumulated in rawData. 'funcAddress' is not used by this implementation.
+void UnwindBuilderWin::finalize(char* target, size_t offset, void* funcAddress, size_t funcSize) const
+{
+    // Copy adjusted function information
+    // Each record is taken by value, so the adjustments below only affect what is written to 'target'
+    for (UnwindFunctionWin func : unwindFunctions)
+    {
+        // Code will start after the unwind info
+        func.beginOffset += uint32_t(offset);
+
+        // Whole block is a part of a 'single function'
+        if (func.endOffset == kFullBlockFuncton)
+            func.endOffset = uint32_t(funcSize);
+        else
+            func.endOffset += uint32_t(offset);
+
+        // Unwind data is placed right after the RUNTIME_FUNCTION data
+        func.unwindInfoOffset += uint32_t(sizeof(UnwindFunctionWin) * unwindFunctions.size());
+        memcpy(target, &func, sizeof(func));
+        target += sizeof(func);
+    }
+
+    // Copy unwind codes
+    memcpy(target, rawData, size_t(rawDataPos - rawData));
 }

 } // namespace CodeGen
--- a/Sources.cmake
+++ b/Sources.cmake
@ -89,9 +89,7 @@ target_sources(Luau.CodeGen PRIVATE
    CodeGen/src/CodeGenA64.cpp
    CodeGen/src/CodeGenX64.cpp
    CodeGen/src/EmitBuiltinsX64.cpp
-    CodeGen/src/EmitCommonA64.cpp
    CodeGen/src/EmitCommonX64.cpp
-    CodeGen/src/EmitInstructionA64.cpp
    CodeGen/src/EmitInstructionX64.cpp
    CodeGen/src/Fallbacks.cpp
    CodeGen/src/IrAnalysis.cpp
@ -111,6 +109,7 @@ target_sources(Luau.CodeGen PRIVATE
    CodeGen/src/UnwindBuilderDwarf2.cpp
    CodeGen/src/UnwindBuilderWin.cpp

+    CodeGen/src/BitUtils.h
    CodeGen/src/ByteUtils.h
    CodeGen/src/CustomExecUtils.h
    CodeGen/src/CodeGenUtils.h
@ -120,7 +119,6 @@ target_sources(Luau.CodeGen PRIVATE
    CodeGen/src/EmitCommon.h
    CodeGen/src/EmitCommonA64.h
    CodeGen/src/EmitCommonX64.h
-    CodeGen/src/EmitInstructionA64.h
    CodeGen/src/EmitInstructionX64.h
    CodeGen/src/Fallbacks.h
    CodeGen/src/FallbacksProlog.h
--- a/VM/src/lapi.cpp
+++ b/VM/src/lapi.cpp
@ -538,6 +538,8 @@ const void* lua_topointer(lua_State* L, int idx)
    StkId o = index2addr(L, idx);
    switch (ttype(o))
    {
+    case LUA_TSTRING:
+        return tsvalue(o);
    case LUA_TTABLE:
        return hvalue(o);
    case LUA_TFUNCTION:
--- a/VM/src/ltable.cpp
+++ b/VM/src/ltable.cpp
@ -33,8 +33,6 @@

 #include <string.h>

-LUAU_FASTFLAGVARIABLE(LuauArrBoundResizeFix, false)
-
 // max size of both array and hash part is 2^MAXBITS
 #define MAXBITS 26
 #define MAXSIZE (1 << MAXBITS)
@ -466,30 +464,22 @@ static void rehash(lua_State* L, Table* t, const TValue* ek)
    int na = computesizes(nums, &nasize);
    int nh = totaluse - na;

-    if (FFlag::LuauArrBoundResizeFix)
+    // enforce the boundary invariant; for performance, only do hash lookups if we must
+    int nadjusted = adjustasize(t, nasize, ek);
+
+    // count how many extra elements belong to array part instead of hash part
+    int aextra = nadjusted - nasize;
+
+    if (aextra != 0)
    {
-        // enforce the boundary invariant; for performance, only do hash lookups if we must
-        int nadjusted = adjustasize(t, nasize, ek);
+        // we no longer need to store those extra array elements in hash part
+        nh -= aextra;

-        // count how many extra elements belong to array part instead of hash part
-        int aextra = nadjusted - nasize;
+        // because hash nodes are twice as large as array nodes, the memory we saved for hash parts can be used by array part
+        // this follows the general sparse array part optimization where array is allocated when 50% occupation is reached
+        nasize = nadjusted + aextra;

-        if (aextra != 0)
-        {
-            // we no longer need to store those extra array elements in hash part
-            nh -= aextra;
-
-            // because hash nodes are twice as large as array nodes, the memory we saved for hash parts can be used by array part
-            // this follows the general sparse array part optimization where array is allocated when 50% occupation is reached
-            nasize = nadjusted + aextra;
-
-            // since the size was changed, it's again important to enforce the boundary invariant at the new size
-            nasize = adjustasize(t, nasize, ek);
-        }
-    }
-    else
-    {
-        // enforce the boundary invariant; for performance, only do hash lookups if we must
+        // since the size was changed, it's again important to enforce the boundary invariant at the new size
        nasize = adjustasize(t, nasize, ek);
    }

--- a/fuzz/linter.cpp
+++ b/fuzz/linter.cpp
@ -21,7 +21,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* Data, size_t Size)
    static Luau::NullFileResolver fileResolver;
    static Luau::NullConfigResolver configResolver;
    static Luau::Frontend frontend{&fileResolver, &configResolver};
-    static int once = (Luau::registerBuiltinGlobals(frontend), 1);
+    static int once = (Luau::registerBuiltinGlobals(frontend, frontend.globals, false), 1);
    (void)once;
    static int once2 = (Luau::freeze(frontend.globals.globalTypes), 1);
    (void)once2;
--- a/fuzz/proto.cpp
+++ b/fuzz/proto.cpp
@ -97,12 +97,12 @@ lua_State* createGlobalState()
    return L;
 }

-int registerTypes(Luau::TypeChecker& typeChecker, Luau::GlobalTypes& globals)
+int registerTypes(Luau::Frontend& frontend, Luau::GlobalTypes& globals, bool forAutocomplete)
 {
    using namespace Luau;
    using std::nullopt;

-    Luau::registerBuiltinGlobals(typeChecker, globals);
+    Luau::registerBuiltinGlobals(frontend, globals, forAutocomplete);

    TypeArena& arena = globals.globalTypes;
    BuiltinTypes& builtinTypes = *globals.builtinTypes;
@ -147,10 +147,10 @@ int registerTypes(Luau::TypeChecker& typeChecker, Luau::GlobalTypes& globals)

 static void setupFrontend(Luau::Frontend& frontend)
 {
-    registerTypes(frontend.typeChecker, frontend.globals);
+    registerTypes(frontend, frontend.globals, false);
    Luau::freeze(frontend.globals.globalTypes);

-    registerTypes(frontend.typeCheckerForAutocomplete, frontend.globalsForAutocomplete);
+    registerTypes(frontend, frontend.globalsForAutocomplete, true);
    Luau::freeze(frontend.globalsForAutocomplete.globalTypes);

    frontend.iceHandler.onInternalError = [](const char* error) {
--- a/fuzz/typeck.cpp
+++ b/fuzz/typeck.cpp
@ -26,7 +26,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* Data, size_t Size)
    static Luau::NullFileResolver fileResolver;
    static Luau::NullConfigResolver configResolver;
    static Luau::Frontend frontend{&fileResolver, &configResolver};
-    static int once = (Luau::registerBuiltinGlobals(frontend), 1);
+    static int once = (Luau::registerBuiltinGlobals(frontend, frontend.globals, false), 1);
    (void)once;
    static int once2 = (Luau::freeze(frontend.globals.globalTypes), 1);
    (void)once2;
--- a/tests/AssemblyBuilderA64.test.cpp
+++ b/tests/AssemblyBuilderA64.test.cpp
@ -86,6 +86,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Binary")
    SINGLE_COMPARE(add(x0, x1, x2, 7), 0x8B021C20);
    SINGLE_COMPARE(sub(x0, x1, x2), 0xCB020020);
    SINGLE_COMPARE(and_(x0, x1, x2), 0x8A020020);
+    SINGLE_COMPARE(bic(x0, x1, x2), 0x8A220020);
    SINGLE_COMPARE(orr(x0, x1, x2), 0xAA020020);
    SINGLE_COMPARE(eor(x0, x1, x2), 0xCA020020);
    SINGLE_COMPARE(lsl(x0, x1, x2), 0x9AC22020);
@ -94,6 +95,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Binary")
    SINGLE_COMPARE(asr(x0, x1, x2), 0x9AC22820);
    SINGLE_COMPARE(ror(x0, x1, x2), 0x9AC22C20);
    SINGLE_COMPARE(cmp(x0, x1), 0xEB01001F);
+    SINGLE_COMPARE(tst(x0, x1), 0xEA01001F);

    // reg, imm
    SINGLE_COMPARE(add(x3, x7, 78), 0x910138E3);
@ -102,6 +104,24 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Binary")
    SINGLE_COMPARE(cmp(w0, 42), 0x7100A81F);
 }

+// Checks the encodings produced for and_/orr/eor/tst when the last operand is an immediate bit mask
+TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "BinaryImm")
+{
+    // instructions: every operation is encoded with the same mask value of 1
+    SINGLE_COMPARE(and_(w1, w2, 1), 0x12000041);
+    SINGLE_COMPARE(orr(w1, w2, 1), 0x32000041);
+    SINGLE_COMPARE(eor(w1, w2, 1), 0x52000041);
+    SINGLE_COMPARE(tst(w1, 1), 0x7200003f);
+
+    // various mask forms: runs of set bits starting at bit 0 (1, 3, 7, 2147483647)
+    // and runs that start at a higher bit (6, 12, 2147483648)
+    SINGLE_COMPARE(and_(w0, w0, 1), 0x12000000);
+    SINGLE_COMPARE(and_(w0, w0, 3), 0x12000400);
+    SINGLE_COMPARE(and_(w0, w0, 7), 0x12000800);
+    SINGLE_COMPARE(and_(w0, w0, 2147483647), 0x12007800);
+    SINGLE_COMPARE(and_(w0, w0, 6), 0x121F0400);
+    SINGLE_COMPARE(and_(w0, w0, 12), 0x121E0400);
+    SINGLE_COMPARE(and_(w0, w0, 2147483648), 0x12010000);
+}
+
 TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Loads")
 {
    // address forms
@ -359,11 +379,13 @@ TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "AddressOffsetSize")
    SINGLE_COMPARE(str(q0, mem(x1, 16)), 0x3D800420);
 }

-TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "ConditionalSelect")
+TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Conditionals")
 {
    SINGLE_COMPARE(csel(x0, x1, x2, ConditionA64::Equal), 0x9A820020);
    SINGLE_COMPARE(csel(w0, w1, w2, ConditionA64::Equal), 0x1A820020);
    SINGLE_COMPARE(fcsel(d0, d1, d2, ConditionA64::Equal), 0x1E620C20);
+
+    SINGLE_COMPARE(cset(x1, ConditionA64::Less), 0x9A9FA7E1);
 }

 TEST_CASE("LogTest")
@ -394,6 +416,7 @@ TEST_CASE("LogTest")
    build.ldr(q1, x2);

    build.csel(x0, x1, x2, ConditionA64::Equal);
+    build.cset(x0, ConditionA64::Equal);

    build.fcmp(d0, d1);
    build.fcmpz(d0);
@ -423,6 +446,7 @@ TEST_CASE("LogTest")
 fabs        d1,d2
 ldr         q1,[x2]
 csel        x0,x1,x2,eq
+ cset        x0,eq
 fcmp        d0,d1
 fcmp        d0,#0
 .L1:
--- a/tests/AssemblyBuilderX64.test.cpp
+++ b/tests/AssemblyBuilderX64.test.cpp
@ -67,6 +67,9 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "BaseBinaryInstructionForms")
    SINGLE_COMPARE(add(rax, 0x7f), 0x48, 0x83, 0xc0, 0x7f);
    SINGLE_COMPARE(add(rax, 0x80), 0x48, 0x81, 0xc0, 0x80, 0x00, 0x00, 0x00);
    SINGLE_COMPARE(add(r10, 0x7fffffff), 0x49, 0x81, 0xc2, 0xff, 0xff, 0xff, 0x7f);
+    SINGLE_COMPARE(add(al, 3), 0x80, 0xc0, 0x03);
+    SINGLE_COMPARE(add(sil, 3), 0x48, 0x80, 0xc6, 0x03);
+    SINGLE_COMPARE(add(r11b, 3), 0x49, 0x80, 0xc3, 0x03);

    // reg, [reg]
    SINGLE_COMPARE(add(rax, qword[rax]), 0x48, 0x03, 0x00);
@ -191,6 +194,8 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfMov")
    SINGLE_COMPARE(mov64(rcx, 0x1234567812345678ll), 0x48, 0xb9, 0x78, 0x56, 0x34, 0x12, 0x78, 0x56, 0x34, 0x12);
    SINGLE_COMPARE(mov(ecx, 2), 0xb9, 0x02, 0x00, 0x00, 0x00);
    SINGLE_COMPARE(mov(cl, 2), 0xb1, 0x02);
+    SINGLE_COMPARE(mov(sil, 2), 0x48, 0xb6, 0x02);
+    SINGLE_COMPARE(mov(r9b, 2), 0x49, 0xb1, 0x02);
    SINGLE_COMPARE(mov(rcx, qword[rdi]), 0x48, 0x8b, 0x0f);
    SINGLE_COMPARE(mov(dword[rax], 0xabcd), 0xc7, 0x00, 0xcd, 0xab, 0x00, 0x00);
    SINGLE_COMPARE(mov(r13, 1), 0x49, 0xbd, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
@ -201,6 +206,8 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfMov")
    SINGLE_COMPARE(mov(qword[rdx], r9), 0x4c, 0x89, 0x0a);
    SINGLE_COMPARE(mov(byte[rsi], 0x3), 0xc6, 0x06, 0x03);
    SINGLE_COMPARE(mov(byte[rsi], al), 0x88, 0x06);
+    SINGLE_COMPARE(mov(byte[rsi], dil), 0x48, 0x88, 0x3e);
+    SINGLE_COMPARE(mov(byte[rsi], r10b), 0x4c, 0x88, 0x16);
 }

 TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfMovExtended")
@ -229,6 +236,8 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfShift")
 {
    SINGLE_COMPARE(shl(al, 1), 0xd0, 0xe0);
    SINGLE_COMPARE(shl(al, cl), 0xd2, 0xe0);
+    SINGLE_COMPARE(shl(sil, cl), 0x48, 0xd2, 0xe6);
+    SINGLE_COMPARE(shl(r10b, cl), 0x49, 0xd2, 0xe2);
    SINGLE_COMPARE(shr(al, 4), 0xc0, 0xe8, 0x04);
    SINGLE_COMPARE(shr(eax, 1), 0xd1, 0xe8);
    SINGLE_COMPARE(sal(eax, cl), 0xd3, 0xe0);
@ -247,6 +256,7 @@ TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfLea")
 TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfSetcc")
 {
    SINGLE_COMPARE(setcc(ConditionX64::NotEqual, bl), 0x0f, 0x95, 0xc3);
+    SINGLE_COMPARE(setcc(ConditionX64::NotEqual, dil), 0x48, 0x0f, 0x95, 0xc7);
    SINGLE_COMPARE(setcc(ConditionX64::BelowEqual, byte[rcx]), 0x0f, 0x96, 0x01);
 }

--- a/tests/Autocomplete.test.cpp
+++ b/tests/Autocomplete.test.cpp
@ -3473,4 +3473,34 @@ TEST_CASE_FIXTURE(ACFixture, "autocomplete_response_perf1" * doctest::timeout(0.
    CHECK(ac.entryMap.count("Instance"));
 }

+// Checks that autocomplete at marker 1 in a --!nonstrict module returns exactly one entry, "x",
+// for a local that was copied from a variable annotated as {x: number}
+TEST_CASE_FIXTURE(ACFixture, "strict_mode_force")
+{
+    check(R"(
+--!nonstrict
+local a: {x: number} = {x=1}
+local b = a
+local c = b.@1
+    )");
+
+    auto ac = autocomplete('1');
+
+    // Exactly one suggestion is expected and it has to be the table field
+    CHECK_EQ(1, ac.entryMap.size());
+    CHECK(ac.entryMap.count("x"));
+}
+
+// With LuauCopyExportedTypes enabled, autocomplete inside a type annotation has to suggest
+// a type that the same module declares with 'export type'
+TEST_CASE_FIXTURE(ACFixture, "suggest_exported_types")
+{
+    ScopedFastFlag luauCopyExportedTypes{"LuauCopyExportedTypes", true};
+
+    check(R"(
+export type Type = {a: number}
+local a: T@1
+    )");
+
+    auto ac = autocomplete('1');
+
+    // The exported alias is offered and the request is classified as a type context
+    CHECK(ac.entryMap.count("Type"));
+    CHECK_EQ(ac.context, AutocompleteContext::Type);
+}
+
 TEST_SUITE_END();
--- a/tests/CodeAllocator.test.cpp
+++ b/tests/CodeAllocator.test.cpp
@ -135,7 +135,8 @@ TEST_CASE("WindowsUnwindCodesX64")

    UnwindBuilderWin unwind;

-    unwind.start();
+    unwind.startInfo();
+    unwind.startFunction();
    unwind.spill(16, rdx);
    unwind.spill(8, rcx);
    unwind.save(rdi);
@ -148,14 +149,15 @@ TEST_CASE("WindowsUnwindCodesX64")
    unwind.save(r15);
    unwind.allocStack(72);
    unwind.setupFrameReg(rbp, 48);
-    unwind.finish();
+    unwind.finishFunction(0x11223344, 0x55443322);
+    unwind.finishInfo();

    std::vector<char> data;
    data.resize(unwind.getSize());
-    unwind.finalize(data.data(), nullptr, 0);
+    unwind.finalize(data.data(), 0, nullptr, 0);

-    std::vector<uint8_t> expected{0x01, 0x23, 0x0a, 0x35, 0x23, 0x33, 0x1e, 0x82, 0x1a, 0xf0, 0x18, 0xe0, 0x16, 0xd0, 0x14, 0xc0, 0x12, 0x50, 0x10,
-        0x30, 0x0e, 0x60, 0x0c, 0x70};
+    std::vector<uint8_t> expected{0x44, 0x33, 0x22, 0x11, 0x22, 0x33, 0x44, 0x55, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x23, 0x0a, 0x35, 0x23, 0x33, 0x1e,
+        0x82, 0x1a, 0xf0, 0x18, 0xe0, 0x16, 0xd0, 0x14, 0xc0, 0x12, 0x50, 0x10, 0x30, 0x0e, 0x60, 0x0c, 0x70};

    REQUIRE(data.size() == expected.size());
    CHECK(memcmp(data.data(), expected.data(), expected.size()) == 0);
@ -168,7 +170,8 @@ TEST_CASE("Dwarf2UnwindCodesX64")

    UnwindBuilderDwarf2 unwind;

-    unwind.start();
+    unwind.startInfo();
+    unwind.startFunction();
    unwind.save(rdi);
    unwind.save(rsi);
    unwind.save(rbx);
@ -179,11 +182,12 @@ TEST_CASE("Dwarf2UnwindCodesX64")
    unwind.save(r15);
    unwind.allocStack(72);
    unwind.setupFrameReg(rbp, 48);
-    unwind.finish();
+    unwind.finishFunction(0, 0);
+    unwind.finishInfo();

    std::vector<char> data;
    data.resize(unwind.getSize());
-    unwind.finalize(data.data(), nullptr, 0);
+    unwind.finalize(data.data(), 0, nullptr, 0);

    std::vector<uint8_t> expected{0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x78, 0x10, 0x0c, 0x07, 0x08, 0x05, 0x10, 0x01,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@ -211,6 +215,8 @@ constexpr X64::RegisterX64 rArg3 = X64::rdx;

 constexpr X64::RegisterX64 rNonVol1 = X64::r12;
 constexpr X64::RegisterX64 rNonVol2 = X64::rbx;
+constexpr X64::RegisterX64 rNonVol3 = X64::r13;
+constexpr X64::RegisterX64 rNonVol4 = X64::r14;

 TEST_CASE("GeneratedCodeExecutionX64")
 {
@ -260,7 +266,10 @@ TEST_CASE("GeneratedCodeExecutionWithThrowX64")
    std::unique_ptr<UnwindBuilder> unwind = std::make_unique<UnwindBuilderDwarf2>();
 #endif

-    unwind->start();
+    unwind->startInfo();
+
+    Label functionBegin = build.setLabel();
+    unwind->startFunction();

    // Prologue
    build.push(rNonVol1);
@ -279,8 +288,6 @@ TEST_CASE("GeneratedCodeExecutionWithThrowX64")
    build.lea(rbp, addr[rsp + stackSize]);
    unwind->setupFrameReg(rbp, stackSize);

-    unwind->finish();
-
    // Body
    build.mov(rNonVol1, rArg1);
    build.mov(rNonVol2, rArg2);
@ -296,8 +303,12 @@ TEST_CASE("GeneratedCodeExecutionWithThrowX64")
    build.pop(rNonVol1);
    build.ret();

+    unwind->finishFunction(build.getLabelOffset(functionBegin), ~0u);
+
    build.finalize();

+    unwind->finishInfo();
+
    size_t blockSize = 1024 * 1024;
    size_t maxTotalSize = 1024 * 1024;
    CodeAllocator allocator(blockSize, maxTotalSize);
@ -326,6 +337,152 @@ TEST_CASE("GeneratedCodeExecutionWithThrowX64")
    }
 }

+// Assembles two functions with different prologues into one code block, records unwind info for each of them
+// and calls both with the 'throwing' callback; the catch handlers check for a std::runtime_error("testing").
+TEST_CASE("GeneratedCodeExecutionMultipleFunctionsWithThrowX64")
+{
+    using namespace X64;
+
+    AssemblyBuilderX64 build(/* logText= */ false);
+
+#if defined(_WIN32)
+    std::unique_ptr<UnwindBuilder> unwind = std::make_unique<UnwindBuilderWin>();
+#else
+    std::unique_ptr<UnwindBuilder> unwind = std::make_unique<UnwindBuilderDwarf2>();
+#endif
+
+    unwind->startInfo();
+
+    // Entry labels are declared outside of the scopes below because they are used again to compute f1 and f2
+    Label start1;
+    Label start2;
+
+    // First function
+    {
+        build.setLabel(start1);
+        unwind->startFunction();
+
+        // Prologue
+        build.push(rNonVol1);
+        unwind->save(rNonVol1);
+        build.push(rNonVol2);
+        unwind->save(rNonVol2);
+        build.push(rbp);
+        unwind->save(rbp);
+
+        int stackSize = 32;
+        int localsSize = 16;
+
+        build.sub(rsp, stackSize + localsSize);
+        unwind->allocStack(stackSize + localsSize);
+
+        build.lea(rbp, addr[rsp + stackSize]);
+        unwind->setupFrameReg(rbp, stackSize);
+
+        // Body
+        build.mov(rNonVol1, rArg1);
+        build.mov(rNonVol2, rArg2);
+
+        build.add(rNonVol1, 15);
+        build.mov(rArg1, rNonVol1);
+        build.call(rNonVol2);
+
+        // Epilogue
+        build.lea(rsp, addr[rbp + localsSize]);
+        build.pop(rbp);
+        build.pop(rNonVol2);
+        build.pop(rNonVol1);
+        build.ret();
+
+        // The first function is given an explicit end offset taken from a label placed right after its 'ret'
+        Label end1 = build.setLabel();
+        unwind->finishFunction(build.getLabelOffset(start1), build.getLabelOffset(end1));
+    }
+
+    // Second function with different layout
+    {
+        build.setLabel(start2);
+        unwind->startFunction();
+
+        // Prologue
+        build.push(rNonVol1);
+        unwind->save(rNonVol1);
+        build.push(rNonVol2);
+        unwind->save(rNonVol2);
+        build.push(rNonVol3);
+        unwind->save(rNonVol3);
+        build.push(rNonVol4);
+        unwind->save(rNonVol4);
+        build.push(rbp);
+        unwind->save(rbp);
+
+        int stackSize = 32;
+        int localsSize = 32;
+
+        build.sub(rsp, stackSize + localsSize);
+        unwind->allocStack(stackSize + localsSize);
+
+        build.lea(rbp, addr[rsp + stackSize]);
+        unwind->setupFrameReg(rbp, stackSize);
+
+        // Body
+        build.mov(rNonVol3, rArg1);
+        build.mov(rNonVol4, rArg2);
+
+        build.add(rNonVol3, 15);
+        build.mov(rArg1, rNonVol3);
+        build.call(rNonVol4);
+
+        // Epilogue
+        build.lea(rsp, addr[rbp + localsSize]);
+        build.pop(rbp);
+        build.pop(rNonVol4);
+        build.pop(rNonVol3);
+        build.pop(rNonVol2);
+        build.pop(rNonVol1);
+        build.ret();
+
+        // No end label here: ~0u is passed as the end offset
+        // NOTE(review): presumably ~0u is the kFullBlockFuncton value checked by the unwind builders - confirm
+        unwind->finishFunction(build.getLabelOffset(start2), ~0u);
+    }
+
+    build.finalize();
+
+    unwind->finishInfo();
+
+    size_t blockSize = 1024 * 1024;
+    size_t maxTotalSize = 1024 * 1024;
+    CodeAllocator allocator(blockSize, maxTotalSize);
+
+    allocator.context = unwind.get();
+    allocator.createBlockUnwindInfo = createBlockUnwindInfo;
+    allocator.destroyBlockUnwindInfo = destroyBlockUnwindInfo;
+
+    uint8_t* nativeData;
+    size_t sizeNativeData;
+    uint8_t* nativeEntry;
+    REQUIRE(allocator.allocate(build.data.data(), build.data.size(), build.code.data(), build.code.size(), nativeData, sizeNativeData, nativeEntry));
+    REQUIRE(nativeEntry);
+
+    // Both entry points are located relative to nativeEntry using the label locations recorded above
+    using FunctionType = int64_t(int64_t, void (*)(int64_t));
+    FunctionType* f1 = (FunctionType*)(nativeEntry + start1.location);
+    FunctionType* f2 = (FunctionType*)(nativeEntry + start2.location);
+
+    // To simplify debugging, CHECK_THROWS_WITH_AS is not used here
+    // NOTE(review): nothing in the try blocks fails the test when the call returns without throwing,
+    // so only the message of an exception that does arrive is verified
+    try
+    {
+        f1(10, throwing);
+    }
+    catch (const std::runtime_error& error)
+    {
+        CHECK(strcmp(error.what(), "testing") == 0);
+    }
+
+    try
+    {
+        f2(10, throwing);
+    }
+    catch (const std::runtime_error& error)
+    {
+        CHECK(strcmp(error.what(), "testing") == 0);
+    }
+}
+
 TEST_CASE("GeneratedCodeExecutionWithThrowOutsideTheGateX64")
 {
    using namespace X64;
@ -338,7 +495,10 @@ TEST_CASE("GeneratedCodeExecutionWithThrowOutsideTheGateX64")
    std::unique_ptr<UnwindBuilder> unwind = std::make_unique<UnwindBuilderDwarf2>();
 #endif

-    unwind->start();
+    unwind->startInfo();
+
+    Label functionBegin = build.setLabel();
+    unwind->startFunction();

    // Prologue (some of these registers don't have to be saved, but we want to have a big prologue)
    build.push(r10);
@ -365,8 +525,6 @@ TEST_CASE("GeneratedCodeExecutionWithThrowOutsideTheGateX64")
    build.lea(rbp, addr[rsp + stackSize]);
    unwind->setupFrameReg(rbp, stackSize);

-    unwind->finish();
-
    size_t prologueSize = build.setLabel().location;

    // Body
@ -387,8 +545,12 @@ TEST_CASE("GeneratedCodeExecutionWithThrowOutsideTheGateX64")
    build.pop(r10);
    build.ret();

+    unwind->finishFunction(build.getLabelOffset(functionBegin), ~0u);
+
    build.finalize();

+    unwind->finishInfo();
+
    size_t blockSize = 4096; // Force allocate to create a new block each time
    size_t maxTotalSize = 1024 * 1024;
    CodeAllocator allocator(blockSize, maxTotalSize);
--- a/tests/Conformance.test.cpp
+++ b/tests/Conformance.test.cpp
@ -285,8 +285,16 @@ TEST_CASE("Tables")
        lua_pushcfunction(
            L,
            [](lua_State* L) {
-                unsigned v = luaL_checkunsigned(L, 1);
-                lua_pushlightuserdata(L, reinterpret_cast<void*>(uintptr_t(v)));
+                if (lua_type(L, 1) == LUA_TNUMBER)
+                {
+                    unsigned v = luaL_checkunsigned(L, 1);
+                    lua_pushlightuserdata(L, reinterpret_cast<void*>(uintptr_t(v)));
+                }
+                else
+                {
+                    const void* p = lua_topointer(L, 1);
+                    lua_pushlightuserdata(L, const_cast<void*>(p));
+                }
                return 1;
            },
            "makelud");
@ -402,21 +410,24 @@ TEST_CASE("PCall")
 {
    ScopedFastFlag sff("LuauBetterOOMHandling", true);

-    runConformance("pcall.lua", [](lua_State* L) {
-        lua_pushcfunction(L, cxxthrow, "cxxthrow");
-        lua_setglobal(L, "cxxthrow");
+    runConformance(
+        "pcall.lua",
+        [](lua_State* L) {
+            lua_pushcfunction(L, cxxthrow, "cxxthrow");
+            lua_setglobal(L, "cxxthrow");

-        lua_pushcfunction(
-            L,
-            [](lua_State* L) -> int {
-                lua_State* co = lua_tothread(L, 1);
-                lua_xmove(L, co, 1);
-                lua_resumeerror(co, L);
-                return 0;
-            },
-            "resumeerror");
-        lua_setglobal(L, "resumeerror");
-    }, nullptr, lua_newstate(limitedRealloc, nullptr));
+            lua_pushcfunction(
+                L,
+                [](lua_State* L) -> int {
+                    lua_State* co = lua_tothread(L, 1);
+                    lua_xmove(L, co, 1);
+                    lua_resumeerror(co, L);
+                    return 0;
+                },
+                "resumeerror");
+            lua_setglobal(L, "resumeerror");
+        },
+        nullptr, lua_newstate(limitedRealloc, nullptr));
 }

 TEST_CASE("Pack")
--- a/tests/Fixture.cpp
+++ b/tests/Fixture.cpp
@ -21,6 +21,7 @@
 static const char* mainModuleName = "MainModule";

 LUAU_FASTFLAG(DebugLuauDeferredConstraintResolution);
+LUAU_FASTFLAG(LuauOnDemandTypecheckers);

 extern std::optional<unsigned> randomSeed; // tests/main.cpp

@ -180,9 +181,16 @@ AstStatBlock* Fixture::parse(const std::string& source, const ParseOptions& pars

                Luau::lint(sourceModule->root, *sourceModule->names, frontend.globals.globalScope, module.get(), sourceModule->hotcomments, {});
            }
+            else if (!FFlag::LuauOnDemandTypecheckers)
+            {
+                ModulePtr module = frontend.typeChecker_DEPRECATED.check(*sourceModule, sourceModule->mode.value_or(Luau::Mode::Nonstrict));
+
+                Luau::lint(sourceModule->root, *sourceModule->names, frontend.globals.globalScope, module.get(), sourceModule->hotcomments, {});
+            }
            else
            {
-                ModulePtr module = frontend.typeChecker.check(*sourceModule, sourceModule->mode.value_or(Luau::Mode::Nonstrict));
+                TypeChecker typeChecker(frontend.globals.globalScope, &moduleResolver, builtinTypes, &frontend.iceHandler);
+                ModulePtr module = typeChecker.check(*sourceModule, sourceModule->mode.value_or(Luau::Mode::Nonstrict), std::nullopt);

                Luau::lint(sourceModule->root, *sourceModule->names, frontend.globals.globalScope, module.get(), sourceModule->hotcomments, {});
            }
--- a/tests/Module.test.cpp
+++ b/tests/Module.test.cpp
@ -3,6 +3,7 @@
 #include "Luau/Module.h"
 #include "Luau/Scope.h"
 #include "Luau/RecursionCounter.h"
+#include "Luau/Parser.h"

 #include "Fixture.h"

@ -42,6 +43,38 @@ TEST_CASE_FIXTURE(Fixture, "is_within_comment")
    CHECK(!isWithinComment(*sm, Position{7, 11}));
 }

+// Exercises the isWithinComment overload that takes a ParseResult: the source is parsed directly
+// with Luau::Parser (comment capture enabled) instead of going through the fixture's check()
+TEST_CASE_FIXTURE(Fixture, "is_within_comment_parse_result")
+{
+    std::string src = R"(
+        --!strict
+        local foo = {}
+        function foo:bar() end
+
+        --[[
+            foo:
+        ]] foo:bar()
+
+        --[[]]--[[]] -- Two distinct comments that have zero characters of space between them.
+    )";
+
+    Luau::Allocator alloc;
+    Luau::AstNameTable names{alloc};
+    Luau::ParseOptions parseOptions;
+    parseOptions.captureComments = true;
+    Luau::ParseResult parseResult = Luau::Parser::parse(src.data(), src.size(), names, alloc, parseOptions);
+
+    // The source contains five comment tokens when the --!strict line is counted as one of them
+    CHECK_EQ(5, parseResult.commentLocations.size());
+
+    // Positions expected to be reported as inside a comment
+    CHECK(isWithinComment(parseResult, Position{1, 15}));
+    CHECK(isWithinComment(parseResult, Position{6, 16}));
+    CHECK(isWithinComment(parseResult, Position{9, 13}));
+    CHECK(isWithinComment(parseResult, Position{9, 14}));
+
+    // Positions expected to be reported as outside of any comment
+    CHECK(!isWithinComment(parseResult, Position{2, 15}));
+    CHECK(!isWithinComment(parseResult, Position{7, 10}));
+    CHECK(!isWithinComment(parseResult, Position{7, 11}));
+}
+
 TEST_CASE_FIXTURE(Fixture, "dont_clone_persistent_primitive")
 {
    TypeArena dest;
@ -319,6 +352,10 @@ TEST_CASE_FIXTURE(Fixture, "clone_recursion_limit")

 TEST_CASE_FIXTURE(Fixture, "any_persistance_does_not_leak")
 {
+    ScopedFastFlag flags[] = {
+        {"LuauOccursIsntAlwaysFailure", true},
+    };
+
    fileResolver.source["Module/A"] = R"(
 export type A = B
 type B = A
@ -332,7 +369,7 @@ type B = A
    auto mod = frontend.moduleResolver.getModule("Module/A");
    auto it = mod->exportedTypeBindings.find("A");
    REQUIRE(it != mod->exportedTypeBindings.end());
-    CHECK(toString(it->second.type) == "any");
+    CHECK(toString(it->second.type) == "*error-type*");
 }

 TEST_CASE_FIXTURE(BuiltinsFixture, "do_not_clone_reexports")
--- a/tests/StringUtils.test.cpp
+++ b/tests/StringUtils.test.cpp
@ -106,4 +106,22 @@ TEST_CASE("AreWeUsingDistanceWithAdjacentTranspositionsAndNotOptimalStringAlignm
    CHECK_EQ(distance, 2);
 }

+TEST_CASE("EditDistanceSupportsUnicode")
+{
+    // ASCII character
+    CHECK_EQ(Luau::editDistance("A block", "X block"), 1);
+
+    // UTF-8 2 byte character
+    CHECK_EQ(Luau::editDistance("A block", "À block"), 2);
+
+    // UTF-8 3 byte character
+    CHECK_EQ(Luau::editDistance("A block", "⪻ block"), 3);
+
+    // UTF-8 4 byte character
+    CHECK_EQ(Luau::editDistance("A block", "𒋄 block"), 4);
+
+    // UTF-8 extreme characters
+    CHECK_EQ(Luau::editDistance("A block", "R̴̨̢̟̚ŏ̶̳̳͚́ͅb̶̡̻̞̐̿ͅl̸̼͝ợ̷̜͓̒̏͜͝ẍ̴̝̦̟̰́̒́̌ block"), 85);
+}
+
 TEST_SUITE_END();
--- a/tests/TypeInfer.annotations.test.cpp
+++ b/tests/TypeInfer.annotations.test.cpp
@ -435,6 +435,10 @@ TEST_CASE_FIXTURE(Fixture, "typeof_expr")

 TEST_CASE_FIXTURE(Fixture, "corecursive_types_error_on_tight_loop")
 {
+    ScopedFastFlag flags[] = {
+        {"LuauOccursIsntAlwaysFailure", true},
+    };
+
    CheckResult result = check(R"(
        type A = B
        type B = A
@ -443,10 +447,10 @@ TEST_CASE_FIXTURE(Fixture, "corecursive_types_error_on_tight_loop")
        local bb:B
    )");

-    TypeId fType = requireType("aa");
-    const AnyType* ftv = get<AnyType>(follow(fType));
-    REQUIRE(ftv != nullptr);
-    REQUIRE(!result.errors.empty());
+    LUAU_REQUIRE_ERROR_COUNT(1, result);
+
+    OccursCheckFailed* ocf = get<OccursCheckFailed>(result.errors[0]);
+    REQUIRE(ocf);
 }

 TEST_CASE_FIXTURE(Fixture, "type_alias_always_resolve_to_a_real_type")
@ -762,6 +766,7 @@ TEST_CASE_FIXTURE(Fixture, "occurs_check_on_cyclic_union_type")
 {
    CheckResult result = check(R"(
        type T = T | T
+        local x : T
    )");

    LUAU_REQUIRE_ERROR_COUNT(1, result);
--- a/tests/TypeInfer.functions.test.cpp
+++ b/tests/TypeInfer.functions.test.cpp
@ -1281,6 +1281,39 @@ f(function(x) return x * 2 end)
    LUAU_REQUIRE_NO_ERRORS(result);
 }

+TEST_CASE_FIXTURE(Fixture, "variadic_any_is_compatible_with_a_generic_TypePack") // forwarding unannotated varargs from one function to another in strict mode
+{
+    ScopedFastFlag sff[] = {
+        {"LuauVariadicAnyCanBeGeneric", true} // the test runs with this fast flag forced on
+    };
+
+    CheckResult result = check(R"(
+        --!strict
+        local function f(...) return ... end
+        local g = function(...) return f(...) end
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // g passing its `...` straight into f must type-check cleanly
+}
+
+// Regression test for https://github.com/Roblox/luau/issues/767: a generic pack `T...` passed to a `...: any` parameter.
+TEST_CASE_FIXTURE(BuiltinsFixture, "variadic_any_is_compatible_with_a_generic_TypePack_2")
+{
+    ScopedFastFlag sff{"LuauVariadicAnyCanBeGeneric", true}; // the test runs with this fast flag forced on
+
+    CheckResult result = check(R"(
+        local function somethingThatsAny(...: any)
+            print(...)
+        end
+
+        local function x<T...>(...: T...)
+            somethingThatsAny(...) -- Failed to unify variadic type packs
+        end
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // the call marked in the script must no longer produce an error
+}
+
 TEST_CASE_FIXTURE(Fixture, "infer_anonymous_function_arguments_outside_call")
 {
    CheckResult result = check(R"(
--- a/tests/TypeInfer.operators.test.cpp
+++ b/tests/TypeInfer.operators.test.cpp
@ -53,10 +53,6 @@ TEST_CASE_FIXTURE(Fixture, "or_joins_types_with_no_superfluous_union")

 TEST_CASE_FIXTURE(Fixture, "and_does_not_always_add_boolean")
 {
-    ScopedFastFlag sff[]{
-        {"LuauTryhardAnd", true},
-    };
-
    CheckResult result = check(R"(
        local s = "a" and 10
        local x:boolean|number = s
@ -737,6 +733,8 @@ TEST_CASE_FIXTURE(Fixture, "error_on_invalid_operand_types_to_relational_operato

 TEST_CASE_FIXTURE(Fixture, "cli_38355_recursive_union")
 {
+    ScopedFastFlag sff{"LuauOccursIsntAlwaysFailure", true};
+
    CheckResult result = check(R"(
        --!strict
        local _
@ -744,7 +742,7 @@ TEST_CASE_FIXTURE(Fixture, "cli_38355_recursive_union")
    )");

    LUAU_REQUIRE_ERROR_COUNT(1, result);
-    CHECK_EQ("Type contains a self-recursive construct that cannot be resolved", toString(result.errors[0]));
+    CHECK_EQ("Unknown type used in + operation; consider adding a type annotation to '_'", toString(result.errors[0]));
 }

 TEST_CASE_FIXTURE(BuiltinsFixture, "UnknownGlobalCompoundAssign")
@ -1048,10 +1046,6 @@ TEST_CASE_FIXTURE(BuiltinsFixture, "mm_comparisons_must_return_a_boolean")

 TEST_CASE_FIXTURE(BuiltinsFixture, "reworked_and")
 {
-    ScopedFastFlag sff[]{
-        {"LuauTryhardAnd", true},
-    };
-
    CheckResult result = check(R"(
 local a: number? = 5
 local b: boolean = (a or 1) > 10
@ -1077,10 +1071,6 @@ local w = c and 1

 TEST_CASE_FIXTURE(BuiltinsFixture, "reworked_or")
 {
-    ScopedFastFlag sff[]{
-        {"LuauTryhardAnd", true},
-    };
-
    CheckResult result = check(R"(
 local a: number | false = 5
 local b: number? = 6
@ -1115,11 +1105,6 @@ local f1 = f or 'f'

 TEST_CASE_FIXTURE(BuiltinsFixture, "reducing_and")
 {
-    ScopedFastFlag sff[]{
-        {"LuauTryhardAnd", true},
-        {"LuauReducingAndOr", true},
-    };
-
    CheckResult result = check(R"(
 type Foo = { name: string?, flag: boolean? }
 local arr: {Foo} = {}
@ -1137,4 +1122,61 @@ end
    LUAU_REQUIRE_NO_ERRORS(result);
 }

+TEST_CASE_FIXTURE(BuiltinsFixture, "luau_polyfill_is_array_simplified") // strict-mode check of a `value: any` guarded by typeof(value) ~= "number"
+{
+    CheckResult result = check(R"(
+     --!strict
+     return function(value: any) : boolean
+        if typeof(value) ~= "number" then
+           return false
+        end
+        if value % 1 ~= 0 or value < 1 then
+           return false
+        end
+        return true
+     end 
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // `value % 1` and `value < 1` after the typeof early-return must not report errors
+}
+
+TEST_CASE_FIXTURE(BuiltinsFixture, "luau_polyfill_is_array") // strict-mode check of a longer isArray-style function taking `value: any`
+{
+    CheckResult result = check(R"(
+--!strict
+return function(value: any): boolean
+    if typeof(value) ~= "table" then
+        return false
+    end
+    if next(value) == nil then
+        -- an empty table is an empty array
+        return true
+    end
+
+    local length = #value
+
+    if length == 0 then
+        return false
+    end
+
+    local count = 0
+    local sum = 0
+    for key in pairs(value) do
+        if typeof(key) ~= "number" then
+            return false
+        end
+        if key % 1 ~= 0 or key < 1 then
+            return false
+        end
+        count += 1
+        sum += key
+    end
+
+    return sum == (count * (count + 1) / 2)
+end
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // the typeof guards, `#value`, pairs loop and arithmetic on `key` must all type-check without errors
+}
+
 TEST_SUITE_END();
--- a/tests/TypeInfer.provisional.test.cpp
+++ b/tests/TypeInfer.provisional.test.cpp
@ -320,23 +320,6 @@ TEST_CASE_FIXTURE(Fixture, "weird_fail_to_unify_type_pack")
    LUAU_REQUIRE_ERRORS(result); // Should not have any errors.
 }

-TEST_CASE_FIXTURE(Fixture, "weird_fail_to_unify_variadic_pack")
-{
-    ScopedFastFlag sff[] = {
-        // I'm not sure why this is broken without DCR, but it seems to be fixed
-        // when DCR is enabled.
-        {"DebugLuauDeferredConstraintResolution", false},
-    };
-
-    CheckResult result = check(R"(
-        --!strict
-        local function f(...) return ... end
-        local g = function(...) return f(...) end
-    )");
-
-    LUAU_REQUIRE_ERRORS(result); // Should not have any errors.
-}
-
 // Belongs in TypeInfer.builtins.test.cpp.
 TEST_CASE_FIXTURE(BuiltinsFixture, "pcall_returns_at_least_two_value_but_function_returns_nothing")
 {
@ -819,4 +802,23 @@ TEST_CASE_FIXTURE(BuiltinsFixture, "table_insert_with_a_singleton_argument")
    }
 }

+// Provisional: we really should be warning on this, since we have no guarantee that T has any properties.
+TEST_CASE_FIXTURE(Fixture, "lookup_prop_of_intersection_containing_unions_of_tables_that_have_the_prop")
+{
+    CheckResult result = check(R"(
+        local function mergeOptions<T>(options: T & ({variable: string} | {variable: number}))
+            return options.variable
+        end
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // pins the current behavior; the commented-out checks below are presumably the desired outcome
+
+    // LUAU_REQUIRE_ERROR_COUNT(1, result);
+
+    // const UnknownProperty* unknownProp = get<UnknownProperty>(result.errors[0]);
+    // REQUIRE(unknownProp);
+
+    // CHECK("variable" == unknownProp->key);
+}
+
 TEST_SUITE_END();
--- a/tests/TypeInfer.test.cpp
+++ b/tests/TypeInfer.test.cpp
@ -1195,6 +1195,21 @@ local b = typeof(foo) ~= 'nil'
    CHECK(toString(result.errors[1]) == "Unknown global 'foo'");
 }

+TEST_CASE_FIXTURE(Fixture, "occurs_isnt_always_failure") // assigning between two locals that the script annotates as the same optional type X?
+{
+    ScopedFastFlag sff{"LuauOccursIsntAlwaysFailure", true}; // the test runs with this fast flag forced on
+
+    CheckResult result = check(R"(
+function f(x, c)                   -- x : X
+    local y = if c then x else nil -- y : X?
+    local z = if c then x else nil -- z : X?
+    y = z
+end
+    )");
+
+    LUAU_REQUIRE_NO_ERRORS(result); // `y = z` must be accepted; presumably this used to trip the occurs check - confirm against the flag's implementation
+}
+
 TEST_CASE_FIXTURE(Fixture, "dcr_delays_expansion_of_function_containing_blocked_parameter_type")
 {
    ScopedFastFlag sff[] = {
--- a/tests/TypeInfer.unionTypes.test.cpp
+++ b/tests/TypeInfer.unionTypes.test.cpp
@ -776,4 +776,20 @@ TEST_CASE_FIXTURE(Fixture, "generic_function_with_optional_arg")
    LUAU_REQUIRE_NO_ERRORS(result);
 }

+TEST_CASE_FIXTURE(Fixture, "lookup_prop_of_intersection_containing_unions") // indexing `T & ({} | {})` with a property that neither table alternative declares
+{
+    CheckResult result = check(R"(
+        local function mergeOptions<T>(options: T & ({} | {}))
+            return options.variables
+        end
+    )");
+
+    LUAU_REQUIRE_ERROR_COUNT(1, result); // exactly one error is expected for `options.variables`
+
+    const UnknownProperty* unknownProp = get<UnknownProperty>(result.errors[0]);
+    REQUIRE(unknownProp); // the single error must be an UnknownProperty
+
+    CHECK("variables" == unknownProp->key); // and it must name the key that was looked up
+}
+
 TEST_SUITE_END();
--- a/tests/TypeInfer.unknownnever.test.cpp
+++ b/tests/TypeInfer.unknownnever.test.cpp
@ -301,11 +301,6 @@ TEST_CASE_FIXTURE(Fixture, "length_of_never")

 TEST_CASE_FIXTURE(Fixture, "dont_unify_operands_if_one_of_the_operand_is_never_in_any_ordering_operators")
 {
-    ScopedFastFlag sff[]{
-        {"LuauTryhardAnd", true},
-        {"LuauReducingAndOr", true},
-    };
-
    CheckResult result = check(R"(
        local function ord(x: nil, y)
            return x ~= nil and x > y
--- a/tests/TypeVar.test.cpp
+++ b/tests/TypeVar.test.cpp
@ -273,12 +273,14 @@ TEST_CASE_FIXTURE(Fixture, "substitution_skip_failure")

    TypeId root = &ttvTweenResult;

-    frontend.typeChecker.currentModule = std::make_shared<Module>();
-    frontend.typeChecker.currentModule->scopes.emplace_back(Location{}, std::make_shared<Scope>(builtinTypes->anyTypePack));
+    ModulePtr currentModule = std::make_shared<Module>();
+    Anyification anyification(&currentModule->internalTypes, frontend.globals.globalScope, builtinTypes, &frontend.iceHandler, builtinTypes->anyType,
+        builtinTypes->anyTypePack);
+    std::optional<TypeId> any = anyification.substitute(root);

-    TypeId result = frontend.typeChecker.anyify(frontend.globals.globalScope, root, Location{});
-
-    CHECK_EQ("{| f: t1 |} where t1 = () -> {| f: () -> {| f: ({| f: t1 |}) -> (), signal: {| f: (any) -> () |} |} |}", toString(result));
+    REQUIRE(!anyification.normalizationTooComplex);
+    REQUIRE(any.has_value());
+    CHECK_EQ("{| f: t1 |} where t1 = () -> {| f: () -> {| f: ({| f: t1 |}) -> (), signal: {| f: (any) -> () |} |} |}", toString(*any));
 }

 TEST_CASE("tagging_tables")
--- a/tests/conformance/math.lua
+++ b/tests/conformance/math.lua
@ -347,5 +347,15 @@ assert(select('#', math.ceil(1.6)) == 1)
 assert(select('#', math.sqrt(9)) == 1)
 assert(select('#', math.deg(9)) == 1)
 assert(select('#', math.rad(9)) == 1)
+assert(select('#', math.sin(1.5)) == 1)
+assert(select('#', math.atan2(1.5, 0.5)) == 1)
+assert(select('#', math.modf(1.5)) == 2)
+assert(select('#', math.frexp(1.5)) == 2)
+
+-- test that fastcalls that return variadic results return them correctly in variadic position
+assert(select(1, math.modf(1.5)) == 1)
+assert(select(2, math.modf(1.5)) == 0.5)
+assert(select(1, math.frexp(1.5)) == 0.75)
+assert(select(2, math.frexp(1.5)) == 1)

 return('OK')
--- a/tests/conformance/tables.lua
+++ b/tests/conformance/tables.lua
@ -715,4 +715,11 @@ do
  end
 end

+-- check that fast path for table lookup can't be tricked into assuming a light user data with string pointer is a string
+assert((function ()
+  local t = {}
+  t[makelud("hi")] = "no"
+  return t.hi
+end)() == nil)
+
 return"OK"
--- a/tools/lvmexecute_split.py
+++ b/tools/lvmexecute_split.py
@ -34,7 +34,7 @@ source = """// This file is part of the Luau programming language and is license
 function = ""
 signature = ""

-includeInsts = ["LOP_NEWCLOSURE", "LOP_NAMECALL", "LOP_FORGPREP", "LOP_GETVARARGS", "LOP_DUPCLOSURE", "LOP_PREPVARARGS", "LOP_BREAK", "LOP_GETGLOBAL", "LOP_SETGLOBAL", "LOP_GETTABLEKS", "LOP_SETTABLEKS"]
+includeInsts = ["LOP_NEWCLOSURE", "LOP_NAMECALL", "LOP_FORGPREP", "LOP_GETVARARGS", "LOP_DUPCLOSURE", "LOP_PREPVARARGS", "LOP_BREAK", "LOP_GETGLOBAL", "LOP_SETGLOBAL", "LOP_GETTABLEKS", "LOP_SETTABLEKS", "LOP_SETLIST"]

 state = 0