luau/tests/AssemblyBuilderA64.test.cpp
Arseny Kapoulkine e6bf71871a
Some checks failed
benchmark / callgrind (map[branch:main name:luau-lang/benchmark-data], ubuntu-22.04) (push) Has been cancelled
build / ${{matrix.os.name}} (map[name:macos version:macos-latest]) (push) Has been cancelled
build / ${{matrix.os.name}} (map[name:macos-arm version:macos-14]) (push) Has been cancelled
build / ${{matrix.os.name}} (map[name:ubuntu version:ubuntu-latest]) (push) Has been cancelled
build / windows (Win32) (push) Has been cancelled
build / windows (x64) (push) Has been cancelled
build / coverage (push) Has been cancelled
build / web (push) Has been cancelled
release / ${{matrix.os.name}} (map[name:macos version:macos-latest]) (push) Has been cancelled
release / ${{matrix.os.name}} (map[name:ubuntu version:ubuntu-20.04]) (push) Has been cancelled
release / ${{matrix.os.name}} (map[name:windows version:windows-latest]) (push) Has been cancelled
release / web (push) Has been cancelled
CodeGen: Rewrite dot product lowering using a dedicated IR instruction (#1512)
Instead of doing the dot product related math in scalar IR, we lift the
computation into a dedicated IR instruction.

On x64, we can use VDPPS which was more or less tailor made for this
purpose. This is better than manual scalar lowering that requires
reloading components from memory; it's not always a strict improvement
over the shuffle+add version (which we never had), but this can now be
adjusted in the IR lowering in an optimal fashion (maybe even based on
CPU vendor, although that'd create issues for offline compilation).

On A64, we can either use naive adds or paired adds, as there is no
dedicated vector-wide horizontal instruction until SVE. Both run at
about the same performance on M2, but paired adds require fewer
instructions and temporaries.

I've measured this using mesh-normal-vector benchmark, changing the
benchmark to just report the time of the second loop inside
`calculate_normals`, testing master vs #1504 vs this PR, also increasing
the grid size to 400 for more stable timings.

On Zen 4 (7950X), this PR is comfortably ~8% faster vs master, while I
see neutral to negative results in #1504.
On M2 (base), this PR is ~28% faster vs master, while #1504 is only
about ~10% faster.

If I measure the second loop in `calculate_tangent_space` instead, I
get:

On Zen 4 (7950X), this PR is ~12% faster vs master, while #1504 is ~3%
faster
On M2 (base), this PR is ~24% faster vs master, while #1504 is only
about ~13% faster.

Note that the loops in question are not quite optimal, as they store and
reload various vectors to dictionary values due to inappropriate use of
locals. The underlying gains in individual functions are thus larger
than the numbers above; for example, changing the `calculate_normals`
loop to use a local variable to store the normalized vector (but still
saving the result to dictionary value), I get a ~24% performance
increase from this PR on Zen4 vs master instead of just 8% (#1504 is
~15% slower in this setup).
2024-11-08 16:23:09 -08:00

651 lines
19 KiB
C++

// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
#include "Luau/AssemblyBuilderA64.h"
#include "Luau/StringUtils.h"
#include "ScopedFlags.h"
#include "doctest.h"
#include <string.h>
using namespace Luau::CodeGen;
using namespace Luau::CodeGen::A64;
static std::string bytecodeAsArray(const std::vector<uint8_t>& bytecode)
{
std::string result = "{";
for (size_t i = 0; i < bytecode.size(); i++)
Luau::formatAppend(result, "%s0x%02x", i == 0 ? "" : ", ", bytecode[i]);
return result.append("}");
}
static std::string bytecodeAsArray(const std::vector<uint32_t>& code)
{
std::string result = "{";
for (size_t i = 0; i < code.size(); i++)
Luau::formatAppend(result, "%s0x%08x", i == 0 ? "" : ", ", code[i]);
return result.append("}");
}
class AssemblyBuilderA64Fixture
{
public:
bool check(void (*f)(AssemblyBuilderA64& build), std::vector<uint32_t> code, std::vector<uint8_t> data = {}, unsigned int features = 0)
{
AssemblyBuilderA64 build(/* logText= */ false, features);
f(build);
build.finalize();
if (build.code != code)
{
printf("Expected code: %s\nReceived code: %s\n", bytecodeAsArray(code).c_str(), bytecodeAsArray(build.code).c_str());
return false;
}
if (build.data != data)
{
printf("Expected data: %s\nReceived data: %s\n", bytecodeAsArray(data).c_str(), bytecodeAsArray(build.data).c_str());
return false;
}
return true;
}
};
// armconverter.com can be used to validate instruction sequences
TEST_SUITE_BEGIN("A64Assembly");
#define SINGLE_COMPARE(inst, ...) \
CHECK(check( \
[](AssemblyBuilderA64& build) \
{ \
build.inst; \
}, \
{__VA_ARGS__} \
))
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Unary")
{
SINGLE_COMPARE(neg(x0, x1), 0xCB0103E0);
SINGLE_COMPARE(neg(w0, w1), 0x4B0103E0);
SINGLE_COMPARE(mvn_(x0, x1), 0xAA2103E0);
SINGLE_COMPARE(clz(x0, x1), 0xDAC01020);
SINGLE_COMPARE(clz(w0, w1), 0x5AC01020);
SINGLE_COMPARE(rbit(x0, x1), 0xDAC00020);
SINGLE_COMPARE(rbit(w0, w1), 0x5AC00020);
SINGLE_COMPARE(rev(w0, w1), 0x5AC00820);
SINGLE_COMPARE(rev(x0, x1), 0xDAC00C20);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Binary")
{
// reg, reg
SINGLE_COMPARE(add(x0, x1, x2), 0x8B020020);
SINGLE_COMPARE(add(w0, w1, w2), 0x0B020020);
SINGLE_COMPARE(add(x0, x1, x2, 7), 0x8B021C20);
SINGLE_COMPARE(add(x0, x1, x2, -7), 0x8B421C20);
SINGLE_COMPARE(sub(x0, x1, x2), 0xCB020020);
SINGLE_COMPARE(and_(x0, x1, x2), 0x8A020020);
SINGLE_COMPARE(and_(x0, x1, x2, 7), 0x8A021C20);
SINGLE_COMPARE(and_(x0, x1, x2, -7), 0x8A421C20);
SINGLE_COMPARE(bic(x0, x1, x2), 0x8A220020);
SINGLE_COMPARE(orr(x0, x1, x2), 0xAA020020);
SINGLE_COMPARE(eor(x0, x1, x2), 0xCA020020);
SINGLE_COMPARE(lsl(x0, x1, x2), 0x9AC22020);
SINGLE_COMPARE(lsl(w0, w1, w2), 0x1AC22020);
SINGLE_COMPARE(lsr(x0, x1, x2), 0x9AC22420);
SINGLE_COMPARE(asr(x0, x1, x2), 0x9AC22820);
SINGLE_COMPARE(ror(x0, x1, x2), 0x9AC22C20);
SINGLE_COMPARE(cmp(x0, x1), 0xEB01001F);
SINGLE_COMPARE(tst(x0, x1), 0xEA01001F);
// reg, imm
SINGLE_COMPARE(add(x3, x7, 78), 0x910138E3);
SINGLE_COMPARE(add(w3, w7, 78), 0x110138E3);
SINGLE_COMPARE(sub(w3, w7, 78), 0x510138E3);
SINGLE_COMPARE(cmp(w0, 42), 0x7100A81F);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "BinaryExtended")
{
// reg, reg
SINGLE_COMPARE(add(x0, x1, w2, 3), 0x8B224C20);
SINGLE_COMPARE(sub(x0, x1, w2, 3), 0xCB224C20);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "BinaryImm")
{
// instructions
SINGLE_COMPARE(and_(w1, w2, 1), 0x12000041);
SINGLE_COMPARE(orr(w1, w2, 1), 0x32000041);
SINGLE_COMPARE(eor(w1, w2, 1), 0x52000041);
SINGLE_COMPARE(tst(w1, 1), 0x7200003f);
// various mask forms
SINGLE_COMPARE(and_(w0, w0, 1), 0x12000000);
SINGLE_COMPARE(and_(w0, w0, 3), 0x12000400);
SINGLE_COMPARE(and_(w0, w0, 7), 0x12000800);
SINGLE_COMPARE(and_(w0, w0, 2147483647), 0x12007800);
SINGLE_COMPARE(and_(w0, w0, 6), 0x121F0400);
SINGLE_COMPARE(and_(w0, w0, 12), 0x121E0400);
SINGLE_COMPARE(and_(w0, w0, 2147483648), 0x12010000);
// shifts
SINGLE_COMPARE(lsl(w1, w2, 1), 0x531F7841);
SINGLE_COMPARE(lsl(x1, x2, 1), 0xD37FF841);
SINGLE_COMPARE(lsr(w1, w2, 1), 0x53017C41);
SINGLE_COMPARE(lsr(x1, x2, 1), 0xD341FC41);
SINGLE_COMPARE(asr(w1, w2, 1), 0x13017C41);
SINGLE_COMPARE(asr(x1, x2, 1), 0x9341FC41);
SINGLE_COMPARE(ror(w1, w2, 1), 0x13820441);
SINGLE_COMPARE(ror(x1, x2, 1), 0x93C20441);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Bitfield")
{
SINGLE_COMPARE(ubfiz(x1, x2, 37, 5), 0xD35B1041);
SINGLE_COMPARE(ubfx(x1, x2, 37, 5), 0xD365A441);
SINGLE_COMPARE(sbfiz(x1, x2, 37, 5), 0x935B1041);
SINGLE_COMPARE(sbfx(x1, x2, 37, 5), 0x9365A441);
SINGLE_COMPARE(ubfiz(w1, w2, 17, 5), 0x530F1041);
SINGLE_COMPARE(ubfx(w1, w2, 17, 5), 0x53115441);
SINGLE_COMPARE(sbfiz(w1, w2, 17, 5), 0x130F1041);
SINGLE_COMPARE(sbfx(w1, w2, 17, 5), 0x13115441);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Loads")
{
// address forms
SINGLE_COMPARE(ldr(x0, x1), 0xF9400020);
SINGLE_COMPARE(ldr(x0, mem(x1, 8)), 0xF9400420);
SINGLE_COMPARE(ldr(x0, mem(x1, x7)), 0xF8676820);
SINGLE_COMPARE(ldr(x0, mem(x1, -7)), 0xF85F9020);
// load sizes
SINGLE_COMPARE(ldr(x0, x1), 0xF9400020);
SINGLE_COMPARE(ldr(w0, x1), 0xB9400020);
SINGLE_COMPARE(ldrb(w0, x1), 0x39400020);
SINGLE_COMPARE(ldrh(w0, x1), 0x79400020);
SINGLE_COMPARE(ldrsb(x0, x1), 0x39800020);
SINGLE_COMPARE(ldrsb(w0, x1), 0x39C00020);
SINGLE_COMPARE(ldrsh(x0, x1), 0x79800020);
SINGLE_COMPARE(ldrsh(w0, x1), 0x79C00020);
SINGLE_COMPARE(ldrsw(x0, x1), 0xB9800020);
// load sizes x offset scaling
SINGLE_COMPARE(ldr(x0, mem(x1, 8)), 0xF9400420);
SINGLE_COMPARE(ldr(w0, mem(x1, 8)), 0xB9400820);
SINGLE_COMPARE(ldrb(w0, mem(x1, 8)), 0x39402020);
SINGLE_COMPARE(ldrh(w0, mem(x1, 8)), 0x79401020);
SINGLE_COMPARE(ldrsb(w0, mem(x1, 8)), 0x39C02020);
SINGLE_COMPARE(ldrsh(w0, mem(x1, 8)), 0x79C01020);
// paired loads
SINGLE_COMPARE(ldp(x0, x1, mem(x2, 8)), 0xA9408440);
SINGLE_COMPARE(ldp(w0, w1, mem(x2, -8)), 0x297F0440);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Stores")
{
// address forms
SINGLE_COMPARE(str(x0, x1), 0xF9000020);
SINGLE_COMPARE(str(x0, mem(x1, 8)), 0xF9000420);
SINGLE_COMPARE(str(x0, mem(x1, x7)), 0xF8276820);
SINGLE_COMPARE(strh(w0, mem(x1, -7)), 0x781F9020);
// store sizes
SINGLE_COMPARE(str(x0, x1), 0xF9000020);
SINGLE_COMPARE(str(w0, x1), 0xB9000020);
SINGLE_COMPARE(strb(w0, x1), 0x39000020);
SINGLE_COMPARE(strh(w0, x1), 0x79000020);
// store sizes x offset scaling
SINGLE_COMPARE(str(x0, mem(x1, 8)), 0xF9000420);
SINGLE_COMPARE(str(w0, mem(x1, 8)), 0xB9000820);
SINGLE_COMPARE(strb(w0, mem(x1, 8)), 0x39002020);
SINGLE_COMPARE(strh(w0, mem(x1, 8)), 0x79001020);
// paired stores
SINGLE_COMPARE(stp(x0, x1, mem(x2, 8)), 0xA9008440);
SINGLE_COMPARE(stp(w0, w1, mem(x2, -8)), 0x293F0440);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Moves")
{
SINGLE_COMPARE(mov(x0, x1), 0xAA0103E0);
SINGLE_COMPARE(mov(w0, w1), 0x2A0103E0);
SINGLE_COMPARE(mov(q0, q1), 0x4EA11C20);
SINGLE_COMPARE(movz(x0, 42), 0xD2800540);
SINGLE_COMPARE(movz(w0, 42), 0x52800540);
SINGLE_COMPARE(movn(x0, 42), 0x92800540);
SINGLE_COMPARE(movn(w0, 42), 0x12800540);
SINGLE_COMPARE(movk(x0, 42, 16), 0xF2A00540);
CHECK(check(
[](AssemblyBuilderA64& build)
{
build.mov(x0, 42);
},
{0xD2800540}
));
CHECK(check(
[](AssemblyBuilderA64& build)
{
build.mov(x0, 424242);
},
{0xD28F2640, 0xF2A000C0}
));
CHECK(check(
[](AssemblyBuilderA64& build)
{
build.mov(x0, -42);
},
{0x92800520}
));
CHECK(check(
[](AssemblyBuilderA64& build)
{
build.mov(x0, -424242);
},
{0x928F2620, 0xF2BFFF20}
));
CHECK(check(
[](AssemblyBuilderA64& build)
{
build.mov(x0, -65536);
},
{0x929FFFE0}
));
CHECK(check(
[](AssemblyBuilderA64& build)
{
build.mov(x0, -65537);
},
{0x92800000, 0xF2BFFFC0}
));
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "ControlFlow")
{
// Jump back
CHECK(check(
[](AssemblyBuilderA64& build)
{
Label start = build.setLabel();
build.mov(x0, x1);
build.b(ConditionA64::Equal, start);
},
{0xAA0103E0, 0x54FFFFE0}
));
// Jump forward
CHECK(check(
[](AssemblyBuilderA64& build)
{
Label skip;
build.b(ConditionA64::Equal, skip);
build.mov(x0, x1);
build.setLabel(skip);
},
{0x54000040, 0xAA0103E0}
));
// Jumps
CHECK(check(
[](AssemblyBuilderA64& build)
{
Label skip;
build.b(ConditionA64::Equal, skip);
build.cbz(x0, skip);
build.cbnz(x0, skip);
build.tbz(x0, 5, skip);
build.tbnz(x0, 5, skip);
build.setLabel(skip);
build.b(skip);
build.bl(skip);
},
{0x540000A0, 0xB4000080, 0xB5000060, 0x36280040, 0x37280020, 0x14000000, 0x97ffffff}
));
// Basic control flow
SINGLE_COMPARE(br(x0), 0xD61F0000);
SINGLE_COMPARE(blr(x0), 0xD63F0000);
SINGLE_COMPARE(ret(), 0xD65F03C0);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "StackOps")
{
SINGLE_COMPARE(mov(x0, sp), 0x910003E0);
SINGLE_COMPARE(mov(sp, x0), 0x9100001F);
SINGLE_COMPARE(add(sp, sp, 4), 0x910013FF);
SINGLE_COMPARE(sub(sp, sp, 4), 0xD10013FF);
SINGLE_COMPARE(add(x0, sp, 4), 0x910013E0);
SINGLE_COMPARE(sub(sp, x0, 4), 0xD100101F);
SINGLE_COMPARE(ldr(x0, mem(sp, 8)), 0xF94007E0);
SINGLE_COMPARE(str(x0, mem(sp, 8)), 0xF90007E0);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Constants")
{
// clang-format off
CHECK(check(
[](AssemblyBuilderA64& build) {
char arr[12] = "hello world";
build.adr(x0, arr, 12);
build.adr(x0, uint64_t(0x1234567887654321));
build.adr(x0, 1.0);
},
{
0x10ffffa0, 0x10ffff20, 0x10fffec0
},
{
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f,
0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12,
0x00, 0x00, 0x00, 0x00, // 4b padding to align double
'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0,
}));
// clang-format on
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "AddressOfLabel")
{
// clang-format off
CHECK(check(
[](AssemblyBuilderA64& build) {
Label label;
build.adr(x0, label);
build.add(x0, x0, x0);
build.setLabel(label);
},
{
0x10000040, 0x8b000000,
}));
// clang-format on
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPBasic")
{
SINGLE_COMPARE(fmov(d0, d1), 0x1E604020);
SINGLE_COMPARE(fmov(d0, x1), 0x9E670020);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPMath")
{
SINGLE_COMPARE(fabs(d1, d2), 0x1E60C041);
SINGLE_COMPARE(fadd(d1, d2, d3), 0x1E632841);
SINGLE_COMPARE(fadd(s29, s29, s28), 0x1E3C2BBD);
SINGLE_COMPARE(fdiv(d1, d2, d3), 0x1E631841);
SINGLE_COMPARE(fdiv(s29, s29, s28), 0x1E3C1BBD);
SINGLE_COMPARE(fmul(d1, d2, d3), 0x1E630841);
SINGLE_COMPARE(fmul(s29, s29, s28), 0x1E3C0BBD);
SINGLE_COMPARE(fneg(d1, d2), 0x1E614041);
SINGLE_COMPARE(fneg(s30, s30), 0x1E2143DE);
SINGLE_COMPARE(fsqrt(d1, d2), 0x1E61C041);
SINGLE_COMPARE(fsub(d1, d2, d3), 0x1E633841);
SINGLE_COMPARE(fsub(s29, s29, s28), 0x1E3C3BBD);
SINGLE_COMPARE(faddp(s29, s28), 0x7E30DB9D);
SINGLE_COMPARE(faddp(d29, d28), 0x7E70DB9D);
SINGLE_COMPARE(frinta(d1, d2), 0x1E664041);
SINGLE_COMPARE(frintm(d1, d2), 0x1E654041);
SINGLE_COMPARE(frintp(d1, d2), 0x1E64C041);
SINGLE_COMPARE(fcvt(s1, d2), 0x1E624041);
SINGLE_COMPARE(fcvt(d1, s2), 0x1E22C041);
SINGLE_COMPARE(fcvtzs(w1, d2), 0x1E780041);
SINGLE_COMPARE(fcvtzs(x1, d2), 0x9E780041);
SINGLE_COMPARE(fcvtzu(w1, d2), 0x1E790041);
SINGLE_COMPARE(fcvtzu(x1, d2), 0x9E790041);
SINGLE_COMPARE(scvtf(d1, w2), 0x1E620041);
SINGLE_COMPARE(scvtf(d1, x2), 0x9E620041);
SINGLE_COMPARE(ucvtf(d1, w2), 0x1E630041);
SINGLE_COMPARE(ucvtf(d1, x2), 0x9E630041);
CHECK(check(
[](AssemblyBuilderA64& build)
{
build.fjcvtzs(w1, d2);
},
{0x1E7E0041},
{},
A64::Feature_JSCVT
));
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPLoadStore")
{
// address forms
SINGLE_COMPARE(ldr(d0, x1), 0xFD400020);
SINGLE_COMPARE(ldr(d0, mem(x1, 8)), 0xFD400420);
SINGLE_COMPARE(ldr(d0, mem(x1, x7)), 0xFC676820);
SINGLE_COMPARE(ldr(d0, mem(x1, -7)), 0xFC5F9020);
SINGLE_COMPARE(str(d0, x1), 0xFD000020);
SINGLE_COMPARE(str(d0, mem(x1, 8)), 0xFD000420);
SINGLE_COMPARE(str(d0, mem(x1, x7)), 0xFC276820);
SINGLE_COMPARE(str(d0, mem(x1, -7)), 0xFC1F9020);
// load/store sizes
SINGLE_COMPARE(ldr(s0, x1), 0xBD400020);
SINGLE_COMPARE(ldr(d0, x1), 0xFD400020);
SINGLE_COMPARE(ldr(q0, x1), 0x3DC00020);
SINGLE_COMPARE(str(s0, x1), 0xBD000020);
SINGLE_COMPARE(str(d0, x1), 0xFD000020);
SINGLE_COMPARE(str(q0, x1), 0x3D800020);
// load/store sizes x offset scaling
SINGLE_COMPARE(ldr(q0, mem(x1, 16)), 0x3DC00420);
SINGLE_COMPARE(ldr(d0, mem(x1, 16)), 0xFD400820);
SINGLE_COMPARE(ldr(s0, mem(x1, 16)), 0xBD401020);
SINGLE_COMPARE(str(q0, mem(x1, 16)), 0x3D800420);
SINGLE_COMPARE(str(d0, mem(x1, 16)), 0xFD000820);
SINGLE_COMPARE(str(s0, mem(x1, 16)), 0xBD001020);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPInsertExtract")
{
SINGLE_COMPARE(ins_4s(q29, w17, 3), 0x4E1C1E3D);
SINGLE_COMPARE(ins_4s(q31, 0, q29, 0), 0x6E0407BF);
SINGLE_COMPARE(dup_4s(s29, q31, 2), 0x5E1407FD);
SINGLE_COMPARE(dup_4s(q29, q30, 0), 0x4E0407DD);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPCompare")
{
SINGLE_COMPARE(fcmp(d0, d1), 0x1E612000);
SINGLE_COMPARE(fcmpz(d1), 0x1E602028);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPImm")
{
SINGLE_COMPARE(fmov(d0, 0), 0x2F00E400);
SINGLE_COMPARE(fmov(d0, 0.125), 0x1E681000);
SINGLE_COMPARE(fmov(d0, -0.125), 0x1E781000);
SINGLE_COMPARE(fmov(d0, 1.9375), 0x1E6FF000);
SINGLE_COMPARE(fmov(q0, 0), 0x4F000400);
SINGLE_COMPARE(fmov(q0, 0.125), 0x4F02F400);
SINGLE_COMPARE(fmov(q0, -0.125), 0x4F06F400);
SINGLE_COMPARE(fmov(q0, 1.9375), 0x4F03F7E0);
CHECK(!AssemblyBuilderA64::isFmovSupported(-0.0));
CHECK(!AssemblyBuilderA64::isFmovSupported(0.12389));
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "AddressOffsetSize")
{
SINGLE_COMPARE(ldr(w0, mem(x1, 16)), 0xB9401020);
SINGLE_COMPARE(ldr(x0, mem(x1, 16)), 0xF9400820);
SINGLE_COMPARE(ldr(d0, mem(x1, 16)), 0xFD400820);
SINGLE_COMPARE(ldr(q0, mem(x1, 16)), 0x3DC00420);
SINGLE_COMPARE(str(w0, mem(x1, 16)), 0xB9001020);
SINGLE_COMPARE(str(x0, mem(x1, 16)), 0xF9000820);
SINGLE_COMPARE(str(d0, mem(x1, 16)), 0xFD000820);
SINGLE_COMPARE(str(q0, mem(x1, 16)), 0x3D800420);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Conditionals")
{
SINGLE_COMPARE(csel(x0, x1, x2, ConditionA64::Equal), 0x9A820020);
SINGLE_COMPARE(csel(w0, w1, w2, ConditionA64::Equal), 0x1A820020);
SINGLE_COMPARE(fcsel(d0, d1, d2, ConditionA64::Equal), 0x1E620C20);
SINGLE_COMPARE(cset(x1, ConditionA64::Less), 0x9A9FA7E1);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Undefined")
{
SINGLE_COMPARE(udf(), 0x00000000);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "PrePostIndexing")
{
SINGLE_COMPARE(ldr(x0, mem(x1, 1)), 0xF8401020);
SINGLE_COMPARE(ldr(x0, mem(x1, 1, AddressKindA64::pre)), 0xF8401C20);
SINGLE_COMPARE(ldr(x0, mem(x1, 1, AddressKindA64::post)), 0xF8401420);
SINGLE_COMPARE(ldr(q0, mem(x1, 1)), 0x3CC01020);
SINGLE_COMPARE(ldr(q0, mem(x1, 1, AddressKindA64::pre)), 0x3CC01C20);
SINGLE_COMPARE(ldr(q0, mem(x1, 1, AddressKindA64::post)), 0x3CC01420);
SINGLE_COMPARE(str(x0, mem(x1, 1)), 0xF8001020);
SINGLE_COMPARE(str(x0, mem(x1, 1, AddressKindA64::pre)), 0xF8001C20);
SINGLE_COMPARE(str(x0, mem(x1, 1, AddressKindA64::post)), 0xF8001420);
SINGLE_COMPARE(str(q0, mem(x1, 1)), 0x3C801020);
SINGLE_COMPARE(str(q0, mem(x1, 1, AddressKindA64::pre)), 0x3C801C20);
SINGLE_COMPARE(str(q0, mem(x1, 1, AddressKindA64::post)), 0x3C801420);
}
TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "SIMDMath")
{
SINGLE_COMPARE(fadd(q0, q1, q2), 0x4E22D420);
SINGLE_COMPARE(fsub(q0, q1, q2), 0x4EA2D420);
SINGLE_COMPARE(fmul(q0, q1, q2), 0x6E22DC20);
SINGLE_COMPARE(fdiv(q0, q1, q2), 0x6E22FC20);
SINGLE_COMPARE(fneg(q0, q1), 0x6EA0F820);
}
TEST_CASE("LogTest")
{
AssemblyBuilderA64 build(/* logText= */ true);
build.add(sp, sp, 4);
build.add(w0, w1, w2);
build.add(x0, x1, x2, 2);
build.add(x0, x1, x2, -2);
build.add(w7, w8, 5);
build.add(x7, x8, 5);
build.ldr(x7, x8);
build.ldr(x7, mem(x8, 8));
build.ldr(x7, mem(x8, x9));
build.mov(x1, x2);
build.movk(x1, 42, 16);
build.cmp(x1, x2);
build.blr(x0);
Label l;
build.b(ConditionA64::Plus, l);
build.cbz(x7, l);
build.ldp(x0, x1, mem(x8, 8));
build.adr(x0, l);
build.fabs(d1, d2);
build.ldr(q1, x2);
build.csel(x0, x1, x2, ConditionA64::Equal);
build.cset(x0, ConditionA64::Equal);
build.fcmp(d0, d1);
build.fcmpz(d0);
build.fmov(d0, 0.25);
build.tbz(x0, 5, l);
build.fcvt(s1, d2);
build.ubfx(x1, x2, 37, 5);
build.ldr(x0, mem(x1, 1));
build.ldr(x0, mem(x1, 1, AddressKindA64::pre));
build.ldr(x0, mem(x1, 1, AddressKindA64::post));
build.add(x1, x2, w3, 3);
build.ins_4s(q29, w17, 3);
build.ins_4s(q31, 1, q29, 2);
build.dup_4s(s29, q31, 2);
build.dup_4s(q29, q30, 0);
build.fmul(q0, q1, q2);
build.setLabel(l);
build.ret();
build.finalize();
std::string expected = R"(
add sp,sp,#4
add w0,w1,w2
add x0,x1,x2 LSL #2
add x0,x1,x2 LSR #2
add w7,w8,#5
add x7,x8,#5
ldr x7,[x8]
ldr x7,[x8,#8]
ldr x7,[x8,x9]
mov x1,x2
movk x1,#42 LSL #16
cmp x1,x2
blr x0
b.pl .L1
cbz x7,.L1
ldp x0,x1,[x8,#8]
adr x0,.L1
fabs d1,d2
ldr q1,[x2]
csel x0,x1,x2,eq
cset x0,eq
fcmp d0,d1
fcmp d0,#0
fmov d0,#0.25
tbz x0,#5,.L1
fcvt s1,d2
ubfx x1,x2,#3705
ldr x0,[x1,#1]
ldr x0,[x1,#1]!
ldr x0,[x1]!,#1
add x1,x2,w3 UXTW #3
ins v29.s[3],w17
ins v31.s[1],v29.s[2]
dup s29,v31.s[2]
dup v29.4s,v30.s[0]
fmul v0.4s,v1.4s,v2.4s
.L1:
ret
)";
CHECK("\n" + build.text == expected);
}
TEST_SUITE_END();