// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
#include "Luau/AssemblyBuilderA64.h"
#include "Luau/StringUtils.h"

#include "ScopedFlags.h"

#include "doctest.h"

#include <string.h>

using namespace Luau::CodeGen;
using namespace Luau::CodeGen::A64;

static std::string bytecodeAsArray(const std::vector<uint8_t>& bytecode)
{
    std::string result = "{";

    for (size_t i = 0; i < bytecode.size(); i++)
        Luau::formatAppend(result, "%s0x%02x", i == 0 ? "" : ", ", bytecode[i]);

    return result.append("}");
}

static std::string bytecodeAsArray(const std::vector<uint32_t>& code)
{
    std::string result = "{";

    for (size_t i = 0; i < code.size(); i++)
        Luau::formatAppend(result, "%s0x%08x", i == 0 ? "" : ", ", code[i]);

    return result.append("}");
}
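
// Shared fixture: check() runs the given callback against a fresh AssemblyBuilderA64 (with logging disabled),
// finalizes the builder, and compares the emitted code and data against the expected vectors, printing both
// sides on mismatch so failing encodings are easy to diff.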
class AssemblyBuilderA64Fixture
{
public:
    bool check(void (*f)(AssemblyBuilderA64& build), std::vector<uint32_t> code, std::vector<uint8_t> data = {}, unsigned int features = 0)
    {
        AssemblyBuilderA64 build(/* logText= */ false, features);

        f(build);
        build.finalize();

        if (build.code != code)
        {
            printf("Expected code: %s\nReceived code: %s\n", bytecodeAsArray(code).c_str(), bytecodeAsArray(build.code).c_str());
            return false;
        }

        if (build.data != data)
        {
            printf("Expected data: %s\nReceived data: %s\n", bytecodeAsArray(data).c_str(), bytecodeAsArray(build.data).c_str());
            return false;
        }

        return true;
    }
};

// armconverter.com can be used to validate instruction sequences
TEST_SUITE_BEGIN("A64Assembly");
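
// Convenience wrapper for the common case: emit a single builder call and compare the result against the
// expected 32-bit instruction word(s).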
#define SINGLE_COMPARE(inst, ...) \
    CHECK(check( \
        [](AssemblyBuilderA64& build) \
        { \
            build.inst; \
        }, \
        {__VA_ARGS__} \
    ))

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Unary")
{
    SINGLE_COMPARE(neg(x0, x1), 0xCB0103E0);
    SINGLE_COMPARE(neg(w0, w1), 0x4B0103E0);
    SINGLE_COMPARE(mvn_(x0, x1), 0xAA2103E0);

    SINGLE_COMPARE(clz(x0, x1), 0xDAC01020);
    SINGLE_COMPARE(clz(w0, w1), 0x5AC01020);
    SINGLE_COMPARE(rbit(x0, x1), 0xDAC00020);
    SINGLE_COMPARE(rbit(w0, w1), 0x5AC00020);
    SINGLE_COMPARE(rev(w0, w1), 0x5AC00820);
    SINGLE_COMPARE(rev(x0, x1), 0xDAC00C20);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Binary")
{
    // reg, reg
    SINGLE_COMPARE(add(x0, x1, x2), 0x8B020020);
    SINGLE_COMPARE(add(w0, w1, w2), 0x0B020020);
    SINGLE_COMPARE(add(x0, x1, x2, 7), 0x8B021C20);
    SINGLE_COMPARE(add(x0, x1, x2, -7), 0x8B421C20);
    SINGLE_COMPARE(sub(x0, x1, x2), 0xCB020020);
    SINGLE_COMPARE(and_(x0, x1, x2), 0x8A020020);
    SINGLE_COMPARE(and_(x0, x1, x2, 7), 0x8A021C20);
    SINGLE_COMPARE(and_(x0, x1, x2, -7), 0x8A421C20);
    SINGLE_COMPARE(bic(x0, x1, x2), 0x8A220020);
    SINGLE_COMPARE(orr(x0, x1, x2), 0xAA020020);
    SINGLE_COMPARE(eor(x0, x1, x2), 0xCA020020);
    SINGLE_COMPARE(lsl(x0, x1, x2), 0x9AC22020);
    SINGLE_COMPARE(lsl(w0, w1, w2), 0x1AC22020);
    SINGLE_COMPARE(lsr(x0, x1, x2), 0x9AC22420);
    SINGLE_COMPARE(asr(x0, x1, x2), 0x9AC22820);
    SINGLE_COMPARE(ror(x0, x1, x2), 0x9AC22C20);
    SINGLE_COMPARE(cmp(x0, x1), 0xEB01001F);
    SINGLE_COMPARE(tst(x0, x1), 0xEA01001F);

    // reg, imm
    SINGLE_COMPARE(add(x3, x7, 78), 0x910138E3);
    SINGLE_COMPARE(add(w3, w7, 78), 0x110138E3);
    SINGLE_COMPARE(sub(w3, w7, 78), 0x510138E3);
    SINGLE_COMPARE(cmp(w0, 42), 0x7100A81F);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "BinaryExtended")
{
    // reg, reg
    SINGLE_COMPARE(add(x0, x1, w2, 3), 0x8B224C20);
    SINGLE_COMPARE(sub(x0, x1, w2, 3), 0xCB224C20);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "BinaryImm")
{
    // instructions
    SINGLE_COMPARE(and_(w1, w2, 1), 0x12000041);
    SINGLE_COMPARE(orr(w1, w2, 1), 0x32000041);
    SINGLE_COMPARE(eor(w1, w2, 1), 0x52000041);
    SINGLE_COMPARE(tst(w1, 1), 0x7200003f);

    // various mask forms
    SINGLE_COMPARE(and_(w0, w0, 1), 0x12000000);
    SINGLE_COMPARE(and_(w0, w0, 3), 0x12000400);
    SINGLE_COMPARE(and_(w0, w0, 7), 0x12000800);
    SINGLE_COMPARE(and_(w0, w0, 2147483647), 0x12007800);
    SINGLE_COMPARE(and_(w0, w0, 6), 0x121F0400);
    SINGLE_COMPARE(and_(w0, w0, 12), 0x121E0400);
    SINGLE_COMPARE(and_(w0, w0, 2147483648), 0x12010000);

    // shifts
    SINGLE_COMPARE(lsl(w1, w2, 1), 0x531F7841);
    SINGLE_COMPARE(lsl(x1, x2, 1), 0xD37FF841);
    SINGLE_COMPARE(lsr(w1, w2, 1), 0x53017C41);
    SINGLE_COMPARE(lsr(x1, x2, 1), 0xD341FC41);
    SINGLE_COMPARE(asr(w1, w2, 1), 0x13017C41);
    SINGLE_COMPARE(asr(x1, x2, 1), 0x9341FC41);
    SINGLE_COMPARE(ror(w1, w2, 1), 0x13820441);
    SINGLE_COMPARE(ror(x1, x2, 1), 0x93C20441);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Bitfield")
{
    SINGLE_COMPARE(ubfiz(x1, x2, 37, 5), 0xD35B1041);
    SINGLE_COMPARE(ubfx(x1, x2, 37, 5), 0xD365A441);
    SINGLE_COMPARE(sbfiz(x1, x2, 37, 5), 0x935B1041);
    SINGLE_COMPARE(sbfx(x1, x2, 37, 5), 0x9365A441);

    SINGLE_COMPARE(ubfiz(w1, w2, 17, 5), 0x530F1041);
    SINGLE_COMPARE(ubfx(w1, w2, 17, 5), 0x53115441);
    SINGLE_COMPARE(sbfiz(w1, w2, 17, 5), 0x130F1041);
    SINGLE_COMPARE(sbfx(w1, w2, 17, 5), 0x13115441);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Loads")
{
    // address forms
    SINGLE_COMPARE(ldr(x0, x1), 0xF9400020);
    SINGLE_COMPARE(ldr(x0, mem(x1, 8)), 0xF9400420);
    SINGLE_COMPARE(ldr(x0, mem(x1, x7)), 0xF8676820);
    SINGLE_COMPARE(ldr(x0, mem(x1, -7)), 0xF85F9020);

    // load sizes
    SINGLE_COMPARE(ldr(x0, x1), 0xF9400020);
    SINGLE_COMPARE(ldr(w0, x1), 0xB9400020);
    SINGLE_COMPARE(ldrb(w0, x1), 0x39400020);
    SINGLE_COMPARE(ldrh(w0, x1), 0x79400020);
    SINGLE_COMPARE(ldrsb(x0, x1), 0x39800020);
    SINGLE_COMPARE(ldrsb(w0, x1), 0x39C00020);
    SINGLE_COMPARE(ldrsh(x0, x1), 0x79800020);
    SINGLE_COMPARE(ldrsh(w0, x1), 0x79C00020);
    SINGLE_COMPARE(ldrsw(x0, x1), 0xB9800020);

    // load sizes x offset scaling
    SINGLE_COMPARE(ldr(x0, mem(x1, 8)), 0xF9400420);
    SINGLE_COMPARE(ldr(w0, mem(x1, 8)), 0xB9400820);
    SINGLE_COMPARE(ldrb(w0, mem(x1, 8)), 0x39402020);
    SINGLE_COMPARE(ldrh(w0, mem(x1, 8)), 0x79401020);
    SINGLE_COMPARE(ldrsb(w0, mem(x1, 8)), 0x39C02020);
    SINGLE_COMPARE(ldrsh(w0, mem(x1, 8)), 0x79C01020);

    // paired loads
    SINGLE_COMPARE(ldp(x0, x1, mem(x2, 8)), 0xA9408440);
    SINGLE_COMPARE(ldp(w0, w1, mem(x2, -8)), 0x297F0440);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Stores")
{
    // address forms
    SINGLE_COMPARE(str(x0, x1), 0xF9000020);
    SINGLE_COMPARE(str(x0, mem(x1, 8)), 0xF9000420);
    SINGLE_COMPARE(str(x0, mem(x1, x7)), 0xF8276820);
    SINGLE_COMPARE(strh(w0, mem(x1, -7)), 0x781F9020);

    // store sizes
    SINGLE_COMPARE(str(x0, x1), 0xF9000020);
    SINGLE_COMPARE(str(w0, x1), 0xB9000020);
    SINGLE_COMPARE(strb(w0, x1), 0x39000020);
    SINGLE_COMPARE(strh(w0, x1), 0x79000020);

    // store sizes x offset scaling
    SINGLE_COMPARE(str(x0, mem(x1, 8)), 0xF9000420);
    SINGLE_COMPARE(str(w0, mem(x1, 8)), 0xB9000820);
    SINGLE_COMPARE(strb(w0, mem(x1, 8)), 0x39002020);
    SINGLE_COMPARE(strh(w0, mem(x1, 8)), 0x79001020);

    // paired stores
    SINGLE_COMPARE(stp(x0, x1, mem(x2, 8)), 0xA9008440);
    SINGLE_COMPARE(stp(w0, w1, mem(x2, -8)), 0x293F0440);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Moves")
{
    SINGLE_COMPARE(mov(x0, x1), 0xAA0103E0);
    SINGLE_COMPARE(mov(w0, w1), 0x2A0103E0);
    SINGLE_COMPARE(mov(q0, q1), 0x4EA11C20);

    SINGLE_COMPARE(movz(x0, 42), 0xD2800540);
    SINGLE_COMPARE(movz(w0, 42), 0x52800540);
    SINGLE_COMPARE(movn(x0, 42), 0x92800540);
    SINGLE_COMPARE(movn(w0, 42), 0x12800540);
    SINGLE_COMPARE(movk(x0, 42, 16), 0xF2A00540);
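
    // mov with an immediate is a pseudo-instruction: depending on the value it expands into a single
    // movz/movn or into movz/movn followed by movk, so the expected sequences below may contain more
    // than one word.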
    CHECK(check(
        [](AssemblyBuilderA64& build)
        {
            build.mov(x0, 42);
        },
        {0xD2800540}
    ));

    CHECK(check(
        [](AssemblyBuilderA64& build)
        {
            build.mov(x0, 424242);
        },
        {0xD28F2640, 0xF2A000C0}
    ));

    CHECK(check(
        [](AssemblyBuilderA64& build)
        {
            build.mov(x0, -42);
        },
        {0x92800520}
    ));

    CHECK(check(
        [](AssemblyBuilderA64& build)
        {
            build.mov(x0, -424242);
        },
        {0x928F2620, 0xF2BFFF20}
    ));

    CHECK(check(
        [](AssemblyBuilderA64& build)
        {
            build.mov(x0, -65536);
        },
        {0x929FFFE0}
    ));

    CHECK(check(
        [](AssemblyBuilderA64& build)
        {
            build.mov(x0, -65537);
        },
        {0x92800000, 0xF2BFFFC0}
    ));
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "ControlFlow")
{
    // Jump back
    CHECK(check(
        [](AssemblyBuilderA64& build)
        {
            Label start = build.setLabel();
            build.mov(x0, x1);
            build.b(ConditionA64::Equal, start);
        },
        {0xAA0103E0, 0x54FFFFE0}
    ));

    // Jump forward
    CHECK(check(
        [](AssemblyBuilderA64& build)
        {
            Label skip;
            build.b(ConditionA64::Equal, skip);
            build.mov(x0, x1);
            build.setLabel(skip);
        },
        {0x54000040, 0xAA0103E0}
    ));

    // Jumps
    CHECK(check(
        [](AssemblyBuilderA64& build)
        {
            Label skip;
            build.b(ConditionA64::Equal, skip);
            build.cbz(x0, skip);
            build.cbnz(x0, skip);
            build.tbz(x0, 5, skip);
            build.tbnz(x0, 5, skip);
            build.setLabel(skip);
            build.b(skip);
            build.bl(skip);
        },
        {0x540000A0, 0xB4000080, 0xB5000060, 0x36280040, 0x37280020, 0x14000000, 0x97ffffff}
    ));

    // Basic control flow
    SINGLE_COMPARE(br(x0), 0xD61F0000);
    SINGLE_COMPARE(blr(x0), 0xD63F0000);
    SINGLE_COMPARE(ret(), 0xD65F03C0);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "StackOps")
{
    SINGLE_COMPARE(mov(x0, sp), 0x910003E0);
    SINGLE_COMPARE(mov(sp, x0), 0x9100001F);

    SINGLE_COMPARE(add(sp, sp, 4), 0x910013FF);
    SINGLE_COMPARE(sub(sp, sp, 4), 0xD10013FF);

    SINGLE_COMPARE(add(x0, sp, 4), 0x910013E0);
    SINGLE_COMPARE(sub(sp, x0, 4), 0xD100101F);

    SINGLE_COMPARE(ldr(x0, mem(sp, 8)), 0xF94007E0);
    SINGLE_COMPARE(str(x0, mem(sp, 8)), 0xF90007E0);
}
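
// Constants passed to adr are stored in the builder's data section and referenced PC-relatively;
// the expected data bytes below check the resulting layout, including the padding inserted to keep
// the 8-byte constants aligned.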

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Constants")
{
    // clang-format off
    CHECK(check(
        [](AssemblyBuilderA64& build) {
            char arr[12] = "hello world";
            build.adr(x0, arr, 12);
            build.adr(x0, uint64_t(0x1234567887654321));
            build.adr(x0, 1.0);
        },
        {
            0x10ffffa0, 0x10ffff20, 0x10fffec0
        },
        {
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f,
            0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12,
            0x00, 0x00, 0x00, 0x00, // 4b padding to align double
            'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', 0x0,
        }));
    // clang-format on
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "AddressOfLabel")
{
    // clang-format off
    CHECK(check(
        [](AssemblyBuilderA64& build) {
            Label label;
            build.adr(x0, label);
            build.add(x0, x0, x0);
            build.setLabel(label);
        },
        {
            0x10000040, 0x8b000000,
        }));
    // clang-format on
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPBasic")
{
    SINGLE_COMPARE(fmov(d0, d1), 0x1E604020);
    SINGLE_COMPARE(fmov(d0, x1), 0x9E670020);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPMath")
{
    SINGLE_COMPARE(fabs(d1, d2), 0x1E60C041);
    SINGLE_COMPARE(fadd(d1, d2, d3), 0x1E632841);
    SINGLE_COMPARE(fadd(s29, s29, s28), 0x1E3C2BBD);
    SINGLE_COMPARE(fdiv(d1, d2, d3), 0x1E631841);
    SINGLE_COMPARE(fdiv(s29, s29, s28), 0x1E3C1BBD);
    SINGLE_COMPARE(fmul(d1, d2, d3), 0x1E630841);
    SINGLE_COMPARE(fmul(s29, s29, s28), 0x1E3C0BBD);
    SINGLE_COMPARE(fneg(d1, d2), 0x1E614041);
    SINGLE_COMPARE(fneg(s30, s30), 0x1E2143DE);
    SINGLE_COMPARE(fsqrt(d1, d2), 0x1E61C041);
    SINGLE_COMPARE(fsub(d1, d2, d3), 0x1E633841);
    SINGLE_COMPARE(fsub(s29, s29, s28), 0x1E3C3BBD);

    SINGLE_COMPARE(faddp(s29, s28), 0x7E30DB9D);
    SINGLE_COMPARE(faddp(d29, d28), 0x7E70DB9D);

    SINGLE_COMPARE(frinta(d1, d2), 0x1E664041);
    SINGLE_COMPARE(frintm(d1, d2), 0x1E654041);
    SINGLE_COMPARE(frintp(d1, d2), 0x1E64C041);

    SINGLE_COMPARE(fcvt(s1, d2), 0x1E624041);
    SINGLE_COMPARE(fcvt(d1, s2), 0x1E22C041);

    SINGLE_COMPARE(fcvtzs(w1, d2), 0x1E780041);
    SINGLE_COMPARE(fcvtzs(x1, d2), 0x9E780041);
    SINGLE_COMPARE(fcvtzu(w1, d2), 0x1E790041);
    SINGLE_COMPARE(fcvtzu(x1, d2), 0x9E790041);

    SINGLE_COMPARE(scvtf(d1, w2), 0x1E620041);
    SINGLE_COMPARE(scvtf(d1, x2), 0x9E620041);
    SINGLE_COMPARE(ucvtf(d1, w2), 0x1E630041);
    SINGLE_COMPARE(ucvtf(d1, x2), 0x9E630041);

    CHECK(check(
        [](AssemblyBuilderA64& build)
        {
            build.fjcvtzs(w1, d2);
        },
        {0x1E7E0041},
        {},
        A64::Feature_JSCVT
    ));
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPLoadStore")
{
    // address forms
    SINGLE_COMPARE(ldr(d0, x1), 0xFD400020);
    SINGLE_COMPARE(ldr(d0, mem(x1, 8)), 0xFD400420);
    SINGLE_COMPARE(ldr(d0, mem(x1, x7)), 0xFC676820);
    SINGLE_COMPARE(ldr(d0, mem(x1, -7)), 0xFC5F9020);
    SINGLE_COMPARE(str(d0, x1), 0xFD000020);
    SINGLE_COMPARE(str(d0, mem(x1, 8)), 0xFD000420);
    SINGLE_COMPARE(str(d0, mem(x1, x7)), 0xFC276820);
    SINGLE_COMPARE(str(d0, mem(x1, -7)), 0xFC1F9020);

    // load/store sizes
    SINGLE_COMPARE(ldr(s0, x1), 0xBD400020);
    SINGLE_COMPARE(ldr(d0, x1), 0xFD400020);
    SINGLE_COMPARE(ldr(q0, x1), 0x3DC00020);
    SINGLE_COMPARE(str(s0, x1), 0xBD000020);
    SINGLE_COMPARE(str(d0, x1), 0xFD000020);
    SINGLE_COMPARE(str(q0, x1), 0x3D800020);

    // load/store sizes x offset scaling
    SINGLE_COMPARE(ldr(q0, mem(x1, 16)), 0x3DC00420);
    SINGLE_COMPARE(ldr(d0, mem(x1, 16)), 0xFD400820);
    SINGLE_COMPARE(ldr(s0, mem(x1, 16)), 0xBD401020);
    SINGLE_COMPARE(str(q0, mem(x1, 16)), 0x3D800420);
    SINGLE_COMPARE(str(d0, mem(x1, 16)), 0xFD000820);
    SINGLE_COMPARE(str(s0, mem(x1, 16)), 0xBD001020);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPInsertExtract")
{
    SINGLE_COMPARE(ins_4s(q29, w17, 3), 0x4E1C1E3D);
    SINGLE_COMPARE(ins_4s(q31, 0, q29, 0), 0x6E0407BF);
    SINGLE_COMPARE(dup_4s(s29, q31, 2), 0x5E1407FD);
    SINGLE_COMPARE(dup_4s(q29, q30, 0), 0x4E0407DD);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPCompare")
{
    SINGLE_COMPARE(fcmp(d0, d1), 0x1E612000);
    SINGLE_COMPARE(fcmpz(d1), 0x1E602028);
}
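
// fmov with an immediate can only encode a limited set of values (an 8-bit sign/exponent/mantissa pattern);
// isFmovSupported reports whether a given double fits, and the checks below include values that do not.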

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "FPImm")
{
    SINGLE_COMPARE(fmov(d0, 0), 0x2F00E400);
    SINGLE_COMPARE(fmov(d0, 0.125), 0x1E681000);
    SINGLE_COMPARE(fmov(d0, -0.125), 0x1E781000);
    SINGLE_COMPARE(fmov(d0, 1.9375), 0x1E6FF000);

    SINGLE_COMPARE(fmov(q0, 0), 0x4F000400);
    SINGLE_COMPARE(fmov(q0, 0.125), 0x4F02F400);
    SINGLE_COMPARE(fmov(q0, -0.125), 0x4F06F400);
    SINGLE_COMPARE(fmov(q0, 1.9375), 0x4F03F7E0);

    CHECK(!AssemblyBuilderA64::isFmovSupported(-0.0));
    CHECK(!AssemblyBuilderA64::isFmovSupported(0.12389));
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "AddressOffsetSize")
{
    SINGLE_COMPARE(ldr(w0, mem(x1, 16)), 0xB9401020);
    SINGLE_COMPARE(ldr(x0, mem(x1, 16)), 0xF9400820);
    SINGLE_COMPARE(ldr(d0, mem(x1, 16)), 0xFD400820);
    SINGLE_COMPARE(ldr(q0, mem(x1, 16)), 0x3DC00420);

    SINGLE_COMPARE(str(w0, mem(x1, 16)), 0xB9001020);
    SINGLE_COMPARE(str(x0, mem(x1, 16)), 0xF9000820);
    SINGLE_COMPARE(str(d0, mem(x1, 16)), 0xFD000820);
    SINGLE_COMPARE(str(q0, mem(x1, 16)), 0x3D800420);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Conditionals")
{
    SINGLE_COMPARE(csel(x0, x1, x2, ConditionA64::Equal), 0x9A820020);
    SINGLE_COMPARE(csel(w0, w1, w2, ConditionA64::Equal), 0x1A820020);
    SINGLE_COMPARE(fcsel(d0, d1, d2, ConditionA64::Equal), 0x1E620C20);

    SINGLE_COMPARE(cset(x1, ConditionA64::Less), 0x9A9FA7E1);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "Undefined")
{
    SINGLE_COMPARE(udf(), 0x00000000);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "PrePostIndexing")
{
    SINGLE_COMPARE(ldr(x0, mem(x1, 1)), 0xF8401020);
    SINGLE_COMPARE(ldr(x0, mem(x1, 1, AddressKindA64::pre)), 0xF8401C20);
    SINGLE_COMPARE(ldr(x0, mem(x1, 1, AddressKindA64::post)), 0xF8401420);

    SINGLE_COMPARE(ldr(q0, mem(x1, 1)), 0x3CC01020);
    SINGLE_COMPARE(ldr(q0, mem(x1, 1, AddressKindA64::pre)), 0x3CC01C20);
    SINGLE_COMPARE(ldr(q0, mem(x1, 1, AddressKindA64::post)), 0x3CC01420);

    SINGLE_COMPARE(str(x0, mem(x1, 1)), 0xF8001020);
    SINGLE_COMPARE(str(x0, mem(x1, 1, AddressKindA64::pre)), 0xF8001C20);
    SINGLE_COMPARE(str(x0, mem(x1, 1, AddressKindA64::post)), 0xF8001420);

    SINGLE_COMPARE(str(q0, mem(x1, 1)), 0x3C801020);
    SINGLE_COMPARE(str(q0, mem(x1, 1, AddressKindA64::pre)), 0x3C801C20);
    SINGLE_COMPARE(str(q0, mem(x1, 1, AddressKindA64::post)), 0x3C801420);
}

TEST_CASE_FIXTURE(AssemblyBuilderA64Fixture, "SIMDMath")
{
    SINGLE_COMPARE(fadd(q0, q1, q2), 0x4E22D420);
    SINGLE_COMPARE(fsub(q0, q1, q2), 0x4EA2D420);
    SINGLE_COMPARE(fmul(q0, q1, q2), 0x6E22DC20);
    SINGLE_COMPARE(fdiv(q0, q1, q2), 0x6E22FC20);
    SINGLE_COMPARE(fneg(q0, q1), 0x6EA0F820);
}
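
// LogTest verifies the human-readable listing collected in build.text when the builder is constructed
// with logText enabled: every instruction emitted below should appear in the expected string.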
TEST_CASE("LogTest")
{
    AssemblyBuilderA64 build(/* logText= */ true);

    build.add(sp, sp, 4);
    build.add(w0, w1, w2);
    build.add(x0, x1, x2, 2);
    build.add(x0, x1, x2, -2);
    build.add(w7, w8, 5);
    build.add(x7, x8, 5);
    build.ldr(x7, x8);
    build.ldr(x7, mem(x8, 8));
    build.ldr(x7, mem(x8, x9));
    build.mov(x1, x2);
    build.movk(x1, 42, 16);
    build.cmp(x1, x2);
    build.blr(x0);

    Label l;
    build.b(ConditionA64::Plus, l);
    build.cbz(x7, l);

    build.ldp(x0, x1, mem(x8, 8));
    build.adr(x0, l);

    build.fabs(d1, d2);
    build.ldr(q1, x2);

    build.csel(x0, x1, x2, ConditionA64::Equal);
    build.cset(x0, ConditionA64::Equal);

    build.fcmp(d0, d1);
    build.fcmpz(d0);

    build.fmov(d0, 0.25);
    build.tbz(x0, 5, l);

    build.fcvt(s1, d2);

    build.ubfx(x1, x2, 37, 5);

    build.ldr(x0, mem(x1, 1));
    build.ldr(x0, mem(x1, 1, AddressKindA64::pre));
    build.ldr(x0, mem(x1, 1, AddressKindA64::post));

    build.add(x1, x2, w3, 3);

    build.ins_4s(q29, w17, 3);
    build.ins_4s(q31, 1, q29, 2);
    build.dup_4s(s29, q31, 2);
    build.dup_4s(q29, q30, 0);

    build.fmul(q0, q1, q2);

    build.setLabel(l);
    build.ret();

    build.finalize();

    std::string expected = R"(
add sp,sp,#4
add w0,w1,w2
add x0,x1,x2 LSL #2
add x0,x1,x2 LSR #2
add w7,w8,#5
add x7,x8,#5
ldr x7,[x8]
ldr x7,[x8,#8]
ldr x7,[x8,x9]
mov x1,x2
movk x1,#42 LSL #16
cmp x1,x2
blr x0
b.pl .L1
cbz x7,.L1
ldp x0,x1,[x8,#8]
adr x0,.L1
fabs d1,d2
ldr q1,[x2]
csel x0,x1,x2,eq
cset x0,eq
fcmp d0,d1
fcmp d0,#0
fmov d0,#0.25
tbz x0,#5,.L1
fcvt s1,d2
ubfx x1,x2,#3705
ldr x0,[x1,#1]
ldr x0,[x1,#1]!
ldr x0,[x1]!,#1
add x1,x2,w3 UXTW #3
ins v29.s[3],w17
ins v31.s[1],v29.s[2]
dup s29,v31.s[2]
dup v29.4s,v30.s[0]
fmul v0.4s,v1.4s,v2.4s
.L1:
ret
)";

    CHECK("\n" + build.text == expected);
}

TEST_SUITE_END();