mirror of
https://github.com/luau-lang/luau.git
synced 2024-11-15 14:25:44 +08:00
e6bf71871a
Some checks failed
benchmark / callgrind (map[branch:main name:luau-lang/benchmark-data], ubuntu-22.04) (push) Has been cancelled
build / ${{matrix.os.name}} (map[name:macos version:macos-latest]) (push) Has been cancelled
build / ${{matrix.os.name}} (map[name:macos-arm version:macos-14]) (push) Has been cancelled
build / ${{matrix.os.name}} (map[name:ubuntu version:ubuntu-latest]) (push) Has been cancelled
build / windows (Win32) (push) Has been cancelled
build / windows (x64) (push) Has been cancelled
build / coverage (push) Has been cancelled
build / web (push) Has been cancelled
release / ${{matrix.os.name}} (map[name:macos version:macos-latest]) (push) Has been cancelled
release / ${{matrix.os.name}} (map[name:ubuntu version:ubuntu-20.04]) (push) Has been cancelled
release / ${{matrix.os.name}} (map[name:windows version:windows-latest]) (push) Has been cancelled
release / web (push) Has been cancelled
Instead of doing the dot product related math in scalar IR, we lift the computation into a dedicated IR instruction. On x64, we can use VDPPS which was more or less tailor made for this purpose. This is better than manual scalar lowering that requires reloading components from memory; it's not always a strict improvement over the shuffle+add version (which we never had), but this can now be adjusted in the IR lowering in an optimal fashion (maybe even based on CPU vendor, although that'd create issues for offline compilation). On A64, we can either use naive adds or paired adds, as there is no dedicated vector-wide horizontal instruction until SVE. Both run at about the same performance on M2, but paired adds require fewer instructions and temporaries. I've measured this using mesh-normal-vector benchmark, changing the benchmark to just report the time of the second loop inside `calculate_normals`, testing master vs #1504 vs this PR, also increasing the grid size to 400 for more stable timings. On Zen 4 (7950X), this PR is comfortably ~8% faster vs master, while I see neutral to negative results in #1504. On M2 (base), this PR is ~28% faster vs master, while #1504 is only about ~10% faster. If I measure the second loop in `calculate_tangent_space` instead, I get: On Zen 4 (7950X), this PR is ~12% faster vs master, while #1504 is ~3% faster On M2 (base), this PR is ~24% faster vs master, while #1504 is only about ~13% faster. Note that the loops in question are not quite optimal, as they store and reload various vectors to dictionary values due to inappropriate use of locals. The underlying gains in individual functions are thus larger than the numbers above; for example, changing the `calculate_normals` loop to use a local variable to store the normalized vector (but still saving the result to dictionary value), I get a ~24% performance increase from this PR on Zen4 vs master instead of just 8% (#1504 is ~15% slower in this setup).
805 lines
33 KiB
C++
805 lines
33 KiB
C++
// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
|
|
#include "Luau/AssemblyBuilderX64.h"
|
|
#include "Luau/StringUtils.h"
|
|
|
|
#include "doctest.h"
|
|
#include "ScopedFlags.h"
|
|
|
|
#include <string.h>
|
|
|
|
using namespace Luau::CodeGen;
|
|
using namespace Luau::CodeGen::X64;
|
|
|
|
static std::string bytecodeAsArray(const std::vector<uint8_t>& bytecode)
|
|
{
|
|
std::string result = "{";
|
|
|
|
for (size_t i = 0; i < bytecode.size(); i++)
|
|
Luau::formatAppend(result, "%s0x%02x", i == 0 ? "" : ", ", bytecode[i]);
|
|
|
|
return result.append("}");
|
|
}
|
|
|
|
class AssemblyBuilderX64Fixture
|
|
{
|
|
public:
|
|
bool check(void (*f)(AssemblyBuilderX64& build), std::vector<uint8_t> code, std::vector<uint8_t> data = {})
|
|
{
|
|
AssemblyBuilderX64 build(/* logText= */ false);
|
|
|
|
f(build);
|
|
|
|
build.finalize();
|
|
|
|
if (build.code != code)
|
|
{
|
|
printf("Expected code: %s\nReceived code: %s\n", bytecodeAsArray(code).c_str(), bytecodeAsArray(build.code).c_str());
|
|
return false;
|
|
}
|
|
|
|
if (build.data != data)
|
|
{
|
|
printf("Expected data: %s\nReceived data: %s\n", bytecodeAsArray(data).c_str(), bytecodeAsArray(build.data).c_str());
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
};
|
|
|
|
TEST_SUITE_BEGIN("x64Assembly");
|
|
|
|
#define SINGLE_COMPARE(inst, ...) \
|
|
CHECK(check( \
|
|
[](AssemblyBuilderX64& build) \
|
|
{ \
|
|
build.inst; \
|
|
}, \
|
|
{__VA_ARGS__} \
|
|
))
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "BaseBinaryInstructionForms")
|
|
{
|
|
// reg, reg
|
|
SINGLE_COMPARE(add(rax, rcx), 0x48, 0x03, 0xc1);
|
|
SINGLE_COMPARE(add(rsp, r12), 0x49, 0x03, 0xe4);
|
|
SINGLE_COMPARE(add(r14, r10), 0x4d, 0x03, 0xf2);
|
|
|
|
// reg, imm
|
|
SINGLE_COMPARE(add(rax, 0), 0x48, 0x83, 0xc0, 0x00);
|
|
SINGLE_COMPARE(add(rax, 0x7f), 0x48, 0x83, 0xc0, 0x7f);
|
|
SINGLE_COMPARE(add(rax, 0x80), 0x48, 0x81, 0xc0, 0x80, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r10, 0x7fffffff), 0x49, 0x81, 0xc2, 0xff, 0xff, 0xff, 0x7f);
|
|
SINGLE_COMPARE(add(al, 3), 0x80, 0xc0, 0x03);
|
|
SINGLE_COMPARE(add(sil, 3), 0x48, 0x80, 0xc6, 0x03);
|
|
SINGLE_COMPARE(add(r11b, 3), 0x49, 0x80, 0xc3, 0x03);
|
|
|
|
// reg, [reg]
|
|
SINGLE_COMPARE(add(rax, qword[rax]), 0x48, 0x03, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[rbx]), 0x48, 0x03, 0x03);
|
|
SINGLE_COMPARE(add(rax, qword[rsp]), 0x48, 0x03, 0x04, 0x24);
|
|
SINGLE_COMPARE(add(rax, qword[rbp]), 0x48, 0x03, 0x45, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[r10]), 0x49, 0x03, 0x02);
|
|
SINGLE_COMPARE(add(rax, qword[r12]), 0x49, 0x03, 0x04, 0x24);
|
|
SINGLE_COMPARE(add(rax, qword[r13]), 0x49, 0x03, 0x45, 0x00);
|
|
|
|
SINGLE_COMPARE(add(r12, qword[rax]), 0x4c, 0x03, 0x20);
|
|
SINGLE_COMPARE(add(r12, qword[rbx]), 0x4c, 0x03, 0x23);
|
|
SINGLE_COMPARE(add(r12, qword[rsp]), 0x4c, 0x03, 0x24, 0x24);
|
|
SINGLE_COMPARE(add(r12, qword[rbp]), 0x4c, 0x03, 0x65, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[r10]), 0x4d, 0x03, 0x22);
|
|
SINGLE_COMPARE(add(r12, qword[r12]), 0x4d, 0x03, 0x24, 0x24);
|
|
SINGLE_COMPARE(add(r12, qword[r13]), 0x4d, 0x03, 0x65, 0x00);
|
|
|
|
// reg, [base+imm8]
|
|
SINGLE_COMPARE(add(rax, qword[rax + 0x1b]), 0x48, 0x03, 0x40, 0x1b);
|
|
SINGLE_COMPARE(add(rax, qword[rbx + 0x1b]), 0x48, 0x03, 0x43, 0x1b);
|
|
SINGLE_COMPARE(add(rax, qword[rsp + 0x1b]), 0x48, 0x03, 0x44, 0x24, 0x1b);
|
|
SINGLE_COMPARE(add(rax, qword[rbp + 0x1b]), 0x48, 0x03, 0x45, 0x1b);
|
|
SINGLE_COMPARE(add(rax, qword[r10 + 0x1b]), 0x49, 0x03, 0x42, 0x1b);
|
|
SINGLE_COMPARE(add(rax, qword[r12 + 0x1b]), 0x49, 0x03, 0x44, 0x24, 0x1b);
|
|
SINGLE_COMPARE(add(rax, qword[r13 + 0x1b]), 0x49, 0x03, 0x45, 0x1b);
|
|
|
|
SINGLE_COMPARE(add(r12, qword[rax + 0x1b]), 0x4c, 0x03, 0x60, 0x1b);
|
|
SINGLE_COMPARE(add(r12, qword[rbx + 0x1b]), 0x4c, 0x03, 0x63, 0x1b);
|
|
SINGLE_COMPARE(add(r12, qword[rsp + 0x1b]), 0x4c, 0x03, 0x64, 0x24, 0x1b);
|
|
SINGLE_COMPARE(add(r12, qword[rbp + 0x1b]), 0x4c, 0x03, 0x65, 0x1b);
|
|
SINGLE_COMPARE(add(r12, qword[r10 + 0x1b]), 0x4d, 0x03, 0x62, 0x1b);
|
|
SINGLE_COMPARE(add(r12, qword[r12 + 0x1b]), 0x4d, 0x03, 0x64, 0x24, 0x1b);
|
|
SINGLE_COMPARE(add(r12, qword[r13 + 0x1b]), 0x4d, 0x03, 0x65, 0x1b);
|
|
|
|
// reg, [base+imm32]
|
|
SINGLE_COMPARE(add(rax, qword[rax + 0xabab]), 0x48, 0x03, 0x80, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[rbx + 0xabab]), 0x48, 0x03, 0x83, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[rsp + 0xabab]), 0x48, 0x03, 0x84, 0x24, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[rbp + 0xabab]), 0x48, 0x03, 0x85, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[r10 + 0xabab]), 0x49, 0x03, 0x82, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[r12 + 0xabab]), 0x49, 0x03, 0x84, 0x24, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[r13 + 0xabab]), 0x49, 0x03, 0x85, 0xab, 0xab, 0x00, 0x00);
|
|
|
|
SINGLE_COMPARE(add(r12, qword[rax + 0xabab]), 0x4c, 0x03, 0xa0, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[rbx + 0xabab]), 0x4c, 0x03, 0xa3, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[rsp + 0xabab]), 0x4c, 0x03, 0xa4, 0x24, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[rbp + 0xabab]), 0x4c, 0x03, 0xa5, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[r10 + 0xabab]), 0x4d, 0x03, 0xa2, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[r12 + 0xabab]), 0x4d, 0x03, 0xa4, 0x24, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[r13 + 0xabab]), 0x4d, 0x03, 0xa5, 0xab, 0xab, 0x00, 0x00);
|
|
|
|
// reg, [index*scale]
|
|
SINGLE_COMPARE(add(rax, qword[rax * 2]), 0x48, 0x03, 0x04, 0x45, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[rbx * 2]), 0x48, 0x03, 0x04, 0x5d, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[rbp * 2]), 0x48, 0x03, 0x04, 0x6d, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[r10 * 2]), 0x4a, 0x03, 0x04, 0x55, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[r12 * 2]), 0x4a, 0x03, 0x04, 0x65, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[r13 * 2]), 0x4a, 0x03, 0x04, 0x6d, 0x00, 0x00, 0x00, 0x00);
|
|
|
|
SINGLE_COMPARE(add(r12, qword[rax * 2]), 0x4c, 0x03, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[rbx * 2]), 0x4c, 0x03, 0x24, 0x5d, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[rbp * 2]), 0x4c, 0x03, 0x24, 0x6d, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[r10 * 2]), 0x4e, 0x03, 0x24, 0x55, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[r12 * 2]), 0x4e, 0x03, 0x24, 0x65, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[r13 * 2]), 0x4e, 0x03, 0x24, 0x6d, 0x00, 0x00, 0x00, 0x00);
|
|
|
|
// reg, [base+index*scale+imm]
|
|
SINGLE_COMPARE(add(rax, qword[rax + rax * 2]), 0x48, 0x03, 0x04, 0x40);
|
|
SINGLE_COMPARE(add(rax, qword[rax + rbx * 2 + 0x1b]), 0x48, 0x03, 0x44, 0x58, 0x1b);
|
|
SINGLE_COMPARE(add(rax, qword[rax + rbp * 2]), 0x48, 0x03, 0x04, 0x68);
|
|
SINGLE_COMPARE(add(rax, qword[rax + rbp + 0xabab]), 0x48, 0x03, 0x84, 0x28, 0xAB, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[rax + r12 + 0x1b]), 0x4a, 0x03, 0x44, 0x20, 0x1b);
|
|
SINGLE_COMPARE(add(rax, qword[rax + r12 * 4 + 0xabab]), 0x4a, 0x03, 0x84, 0xa0, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[rax + r13 * 2 + 0x1b]), 0x4a, 0x03, 0x44, 0x68, 0x1b);
|
|
SINGLE_COMPARE(add(rax, qword[rax + r13 + 0xabab]), 0x4a, 0x03, 0x84, 0x28, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[rax + r12 * 2]), 0x4e, 0x03, 0x24, 0x60);
|
|
SINGLE_COMPARE(add(r12, qword[rax + r13 + 0xabab]), 0x4e, 0x03, 0xA4, 0x28, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(r12, qword[rax + rbp * 2 + 0x1b]), 0x4c, 0x03, 0x64, 0x68, 0x1b);
|
|
|
|
// reg, [imm32]
|
|
SINGLE_COMPARE(add(rax, qword[0]), 0x48, 0x03, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(rax, qword[0xabab]), 0x48, 0x03, 0x04, 0x25, 0xab, 0xab, 0x00, 0x00);
|
|
|
|
// [addr], reg
|
|
SINGLE_COMPARE(add(qword[rax], rax), 0x48, 0x01, 0x00);
|
|
SINGLE_COMPARE(add(qword[rax + rax * 4 + 0xabab], rax), 0x48, 0x01, 0x84, 0x80, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(qword[rbx + rax * 2 + 0x1b], rax), 0x48, 0x01, 0x44, 0x43, 0x1b);
|
|
SINGLE_COMPARE(add(qword[rbx + rbp * 2 + 0x1b], rax), 0x48, 0x01, 0x44, 0x6b, 0x1b);
|
|
SINGLE_COMPARE(add(qword[rbp + rbp * 4 + 0xabab], rax), 0x48, 0x01, 0x84, 0xad, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(qword[rbp + r12 + 0x1b], rax), 0x4a, 0x01, 0x44, 0x25, 0x1b);
|
|
SINGLE_COMPARE(add(qword[r12], rax), 0x49, 0x01, 0x04, 0x24);
|
|
SINGLE_COMPARE(add(qword[r13 + rbx + 0xabab], rax), 0x49, 0x01, 0x84, 0x1d, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(qword[rax + r13 * 2 + 0x1b], rsi), 0x4a, 0x01, 0x74, 0x68, 0x1b);
|
|
SINGLE_COMPARE(add(qword[rbp + rbx * 2], rsi), 0x48, 0x01, 0x74, 0x5d, 0x00);
|
|
SINGLE_COMPARE(add(qword[rsp + r10 * 2 + 0x1b], r10), 0x4e, 0x01, 0x54, 0x54, 0x1b);
|
|
|
|
// [addr], imm
|
|
SINGLE_COMPARE(add(byte[rax], 2), 0x80, 0x00, 0x02);
|
|
SINGLE_COMPARE(add(dword[rax], 2), 0x83, 0x00, 0x02);
|
|
SINGLE_COMPARE(add(dword[rax], 0xabcd), 0x81, 0x00, 0xcd, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(add(qword[rax], 2), 0x48, 0x83, 0x00, 0x02);
|
|
SINGLE_COMPARE(add(qword[rax], 0xabcd), 0x48, 0x81, 0x00, 0xcd, 0xab, 0x00, 0x00);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "BaseUnaryInstructionForms")
|
|
{
|
|
SINGLE_COMPARE(div(rcx), 0x48, 0xf7, 0xf1);
|
|
SINGLE_COMPARE(idiv(qword[rax]), 0x48, 0xf7, 0x38);
|
|
SINGLE_COMPARE(mul(qword[rax + rbx]), 0x48, 0xf7, 0x24, 0x18);
|
|
SINGLE_COMPARE(imul(r9), 0x49, 0xf7, 0xe9);
|
|
SINGLE_COMPARE(neg(r9), 0x49, 0xf7, 0xd9);
|
|
SINGLE_COMPARE(not_(r12), 0x49, 0xf7, 0xd4);
|
|
SINGLE_COMPARE(inc(r12), 0x49, 0xff, 0xc4);
|
|
SINGLE_COMPARE(dec(ecx), 0xff, 0xc9);
|
|
SINGLE_COMPARE(dec(byte[rdx]), 0xfe, 0x0a);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfMov")
|
|
{
|
|
SINGLE_COMPARE(mov(rcx, 1), 0x48, 0xb9, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(mov64(rcx, 0x1234567812345678ll), 0x48, 0xb9, 0x78, 0x56, 0x34, 0x12, 0x78, 0x56, 0x34, 0x12);
|
|
SINGLE_COMPARE(mov(ecx, 2), 0xb9, 0x02, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(mov(cl, 2), 0xb1, 0x02);
|
|
SINGLE_COMPARE(mov(sil, 2), 0x48, 0xb6, 0x02);
|
|
SINGLE_COMPARE(mov(r9b, 2), 0x49, 0xb1, 0x02);
|
|
SINGLE_COMPARE(mov(rcx, qword[rdi]), 0x48, 0x8b, 0x0f);
|
|
SINGLE_COMPARE(mov(dword[rax], 0xabcd), 0xc7, 0x00, 0xcd, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(mov(r13, 1), 0x49, 0xbd, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(mov64(r13, 0x1234567812345678ll), 0x49, 0xbd, 0x78, 0x56, 0x34, 0x12, 0x78, 0x56, 0x34, 0x12);
|
|
SINGLE_COMPARE(mov(r13d, 2), 0x41, 0xbd, 0x02, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(mov(r13, qword[r12]), 0x4d, 0x8b, 0x2c, 0x24);
|
|
SINGLE_COMPARE(mov(dword[r13], 0xabcd), 0x41, 0xc7, 0x45, 0x00, 0xcd, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(mov(qword[rdx], r9), 0x4c, 0x89, 0x0a);
|
|
SINGLE_COMPARE(mov(byte[rsi], 0x3), 0xc6, 0x06, 0x03);
|
|
SINGLE_COMPARE(mov(byte[rsi], al), 0x88, 0x06);
|
|
SINGLE_COMPARE(mov(byte[rsi], dil), 0x48, 0x88, 0x3e);
|
|
SINGLE_COMPARE(mov(byte[rsi], r10b), 0x4c, 0x88, 0x16);
|
|
SINGLE_COMPARE(mov(wordReg(ebx), 0x3a3d), 0x66, 0xbb, 0x3d, 0x3a);
|
|
SINGLE_COMPARE(mov(word[rsi], 0x3a3d), 0x66, 0xc7, 0x06, 0x3d, 0x3a);
|
|
SINGLE_COMPARE(mov(word[rsi], wordReg(eax)), 0x66, 0x89, 0x06);
|
|
SINGLE_COMPARE(mov(word[rsi], wordReg(edi)), 0x66, 0x89, 0x3e);
|
|
SINGLE_COMPARE(mov(word[rsi], wordReg(r10)), 0x66, 0x44, 0x89, 0x16);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfMovExtended")
|
|
{
|
|
SINGLE_COMPARE(movsx(eax, byte[rcx]), 0x0f, 0xbe, 0x01);
|
|
SINGLE_COMPARE(movsx(r12, byte[r10]), 0x4d, 0x0f, 0xbe, 0x22);
|
|
SINGLE_COMPARE(movsx(ebx, word[r11]), 0x41, 0x0f, 0xbf, 0x1b);
|
|
SINGLE_COMPARE(movsx(rdx, word[rcx]), 0x48, 0x0f, 0xbf, 0x11);
|
|
SINGLE_COMPARE(movzx(eax, byte[rcx]), 0x0f, 0xb6, 0x01);
|
|
SINGLE_COMPARE(movzx(r12, byte[r10]), 0x4d, 0x0f, 0xb6, 0x22);
|
|
SINGLE_COMPARE(movzx(ebx, word[r11]), 0x41, 0x0f, 0xb7, 0x1b);
|
|
SINGLE_COMPARE(movzx(rdx, word[rcx]), 0x48, 0x0f, 0xb7, 0x11);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfTest")
|
|
{
|
|
SINGLE_COMPARE(test(al, 8), 0xf6, 0xc0, 0x08);
|
|
SINGLE_COMPARE(test(eax, 8), 0xf7, 0xc0, 0x08, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(test(rax, 8), 0x48, 0xf7, 0xc0, 0x08, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(test(rcx, 0xabab), 0x48, 0xf7, 0xc1, 0xab, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(test(rcx, rax), 0x48, 0x85, 0xc8);
|
|
SINGLE_COMPARE(test(rax, qword[rcx]), 0x48, 0x85, 0x01);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfShift")
|
|
{
|
|
SINGLE_COMPARE(shl(al, 1), 0xd0, 0xe0);
|
|
SINGLE_COMPARE(shl(al, cl), 0xd2, 0xe0);
|
|
SINGLE_COMPARE(shl(sil, cl), 0x48, 0xd2, 0xe6);
|
|
SINGLE_COMPARE(shl(r10b, cl), 0x49, 0xd2, 0xe2);
|
|
SINGLE_COMPARE(shr(al, 4), 0xc0, 0xe8, 0x04);
|
|
SINGLE_COMPARE(shr(eax, 1), 0xd1, 0xe8);
|
|
SINGLE_COMPARE(sal(eax, cl), 0xd3, 0xe0);
|
|
SINGLE_COMPARE(sal(eax, 4), 0xc1, 0xe0, 0x04);
|
|
SINGLE_COMPARE(sar(rax, 4), 0x48, 0xc1, 0xf8, 0x04);
|
|
SINGLE_COMPARE(sar(r11, 1), 0x49, 0xd1, 0xfb);
|
|
SINGLE_COMPARE(rol(eax, 1), 0xd1, 0xc0);
|
|
SINGLE_COMPARE(rol(eax, cl), 0xd3, 0xc0);
|
|
SINGLE_COMPARE(ror(eax, 1), 0xd1, 0xc8);
|
|
SINGLE_COMPARE(ror(eax, cl), 0xd3, 0xc8);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfLea")
|
|
{
|
|
SINGLE_COMPARE(lea(rax, addr[rdx + rcx]), 0x48, 0x8d, 0x04, 0x0a);
|
|
SINGLE_COMPARE(lea(rax, addr[rdx + rax * 4]), 0x48, 0x8d, 0x04, 0x82);
|
|
SINGLE_COMPARE(lea(rax, addr[r13 + r12 * 4 + 4]), 0x4b, 0x8d, 0x44, 0xa5, 0x04);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfSetcc")
|
|
{
|
|
SINGLE_COMPARE(setcc(ConditionX64::NotEqual, bl), 0x0f, 0x95, 0xc3);
|
|
SINGLE_COMPARE(setcc(ConditionX64::NotEqual, dil), 0x48, 0x0f, 0x95, 0xc7);
|
|
SINGLE_COMPARE(setcc(ConditionX64::BelowEqual, byte[rcx]), 0x0f, 0x96, 0x01);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfCmov")
|
|
{
|
|
SINGLE_COMPARE(cmov(ConditionX64::LessEqual, ebx, eax), 0x0f, 0x4e, 0xd8);
|
|
SINGLE_COMPARE(cmov(ConditionX64::NotZero, rbx, qword[rax]), 0x48, 0x0f, 0x45, 0x18);
|
|
SINGLE_COMPARE(cmov(ConditionX64::Zero, rbx, qword[rax + rcx]), 0x48, 0x0f, 0x44, 0x1c, 0x08);
|
|
SINGLE_COMPARE(cmov(ConditionX64::BelowEqual, r14d, r15d), 0x45, 0x0f, 0x46, 0xf7);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfAbsoluteJumps")
|
|
{
|
|
SINGLE_COMPARE(jmp(rax), 0xff, 0xe0);
|
|
SINGLE_COMPARE(jmp(r14), 0x41, 0xff, 0xe6);
|
|
SINGLE_COMPARE(jmp(qword[r14 + rdx * 4]), 0x41, 0xff, 0x24, 0x96);
|
|
SINGLE_COMPARE(call(rax), 0xff, 0xd0);
|
|
SINGLE_COMPARE(call(r14), 0x41, 0xff, 0xd6);
|
|
SINGLE_COMPARE(call(qword[r14 + rdx * 4]), 0x41, 0xff, 0x14, 0x96);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "FormsOfImul")
|
|
{
|
|
SINGLE_COMPARE(imul(ecx, esi), 0x0f, 0xaf, 0xce);
|
|
SINGLE_COMPARE(imul(r12, rax), 0x4c, 0x0f, 0xaf, 0xe0);
|
|
SINGLE_COMPARE(imul(r12, qword[rdx + rdi]), 0x4c, 0x0f, 0xaf, 0x24, 0x3a);
|
|
SINGLE_COMPARE(imul(ecx, edx, 8), 0x6b, 0xca, 0x08);
|
|
SINGLE_COMPARE(imul(ecx, r9d, 0xabcd), 0x41, 0x69, 0xc9, 0xcd, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(imul(r8d, eax, -9), 0x44, 0x6b, 0xc0, 0xf7);
|
|
SINGLE_COMPARE(imul(rcx, rdx, 17), 0x48, 0x6b, 0xca, 0x11);
|
|
SINGLE_COMPARE(imul(rcx, r12, 0xabcd), 0x49, 0x69, 0xcc, 0xcd, 0xab, 0x00, 0x00);
|
|
SINGLE_COMPARE(imul(r12, rax, -13), 0x4c, 0x6b, 0xe0, 0xf3);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "NopForms")
|
|
{
|
|
SINGLE_COMPARE(nop(), 0x90);
|
|
SINGLE_COMPARE(nop(2), 0x66, 0x90);
|
|
SINGLE_COMPARE(nop(3), 0x0f, 0x1f, 0x00);
|
|
SINGLE_COMPARE(nop(4), 0x0f, 0x1f, 0x40, 0x00);
|
|
SINGLE_COMPARE(nop(5), 0x0f, 0x1f, 0x44, 0x00, 0x00);
|
|
SINGLE_COMPARE(nop(6), 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00);
|
|
SINGLE_COMPARE(nop(7), 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(nop(8), 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(nop(9), 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00);
|
|
SINGLE_COMPARE(nop(15), 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00); // 9+6
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "AlignmentForms")
|
|
{
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build)
|
|
{
|
|
build.ret();
|
|
build.align(8, AlignmentDataX64::Nop);
|
|
},
|
|
{0xc3, 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}
|
|
));
|
|
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build)
|
|
{
|
|
build.ret();
|
|
build.align(32, AlignmentDataX64::Nop);
|
|
},
|
|
{0xc3, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00,
|
|
0x00, 0x00, 0x00, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x1f, 0x40, 0x00}
|
|
));
|
|
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build)
|
|
{
|
|
build.ret();
|
|
build.align(8, AlignmentDataX64::Int3);
|
|
},
|
|
{0xc3, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc}
|
|
));
|
|
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build)
|
|
{
|
|
build.ret();
|
|
build.align(8, AlignmentDataX64::Ud2);
|
|
},
|
|
{0xc3, 0x0f, 0x0b, 0x0f, 0x0b, 0x0f, 0x0b, 0xcc}
|
|
));
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "AlignmentOverflow")
|
|
{
|
|
// Test that alignment correctly resizes the code buffer
|
|
{
|
|
AssemblyBuilderX64 build(/* logText */ false);
|
|
|
|
build.ret();
|
|
build.align(8192, AlignmentDataX64::Nop);
|
|
build.finalize();
|
|
}
|
|
|
|
{
|
|
AssemblyBuilderX64 build(/* logText */ false);
|
|
|
|
build.ret();
|
|
build.align(8192, AlignmentDataX64::Int3);
|
|
build.finalize();
|
|
}
|
|
|
|
{
|
|
AssemblyBuilderX64 build(/* logText */ false);
|
|
|
|
for (int i = 0; i < 8192; i++)
|
|
build.int3();
|
|
build.finalize();
|
|
}
|
|
|
|
{
|
|
AssemblyBuilderX64 build(/* logText */ false);
|
|
|
|
build.ret();
|
|
build.align(8192, AlignmentDataX64::Ud2);
|
|
build.finalize();
|
|
}
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "ControlFlow")
|
|
{
|
|
// Jump back
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build)
|
|
{
|
|
Label start = build.setLabel();
|
|
build.add(rsi, 1);
|
|
build.cmp(rsi, rdi);
|
|
build.jcc(ConditionX64::Equal, start);
|
|
},
|
|
{0x48, 0x83, 0xc6, 0x01, 0x48, 0x3b, 0xf7, 0x0f, 0x84, 0xf3, 0xff, 0xff, 0xff}
|
|
));
|
|
|
|
// Jump back, but the label is set before use
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build)
|
|
{
|
|
Label start;
|
|
build.add(rsi, 1);
|
|
build.setLabel(start);
|
|
build.cmp(rsi, rdi);
|
|
build.jcc(ConditionX64::Equal, start);
|
|
},
|
|
{0x48, 0x83, 0xc6, 0x01, 0x48, 0x3b, 0xf7, 0x0f, 0x84, 0xf7, 0xff, 0xff, 0xff}
|
|
));
|
|
|
|
// Jump forward
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build)
|
|
{
|
|
Label skip;
|
|
|
|
build.cmp(rsi, rdi);
|
|
build.jcc(ConditionX64::Greater, skip);
|
|
build.or_(rdi, 0x3e);
|
|
build.setLabel(skip);
|
|
},
|
|
{0x48, 0x3b, 0xf7, 0x0f, 0x8f, 0x04, 0x00, 0x00, 0x00, 0x48, 0x83, 0xcf, 0x3e}
|
|
));
|
|
|
|
// Regular jump
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build)
|
|
{
|
|
Label skip;
|
|
|
|
build.jmp(skip);
|
|
build.and_(rdi, 0x3e);
|
|
build.setLabel(skip);
|
|
},
|
|
{0xe9, 0x04, 0x00, 0x00, 0x00, 0x48, 0x83, 0xe7, 0x3e}
|
|
));
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "LabelCall")
|
|
{
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build)
|
|
{
|
|
Label fnB;
|
|
|
|
build.and_(rcx, 0x3e);
|
|
build.call(fnB);
|
|
build.ret();
|
|
|
|
build.setLabel(fnB);
|
|
build.lea(rax, addr[rcx + 0x1f]);
|
|
build.ret();
|
|
},
|
|
{0x48, 0x83, 0xe1, 0x3e, 0xe8, 0x01, 0x00, 0x00, 0x00, 0xc3, 0x48, 0x8d, 0x41, 0x1f, 0xc3}
|
|
));
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "AVXBinaryInstructionForms")
|
|
{
|
|
SINGLE_COMPARE(vaddpd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x29, 0x58, 0xc6);
|
|
SINGLE_COMPARE(vaddpd(xmm8, xmm10, xmmword[r9]), 0xc4, 0x41, 0x29, 0x58, 0x01);
|
|
SINGLE_COMPARE(vaddpd(ymm8, ymm10, ymm14), 0xc4, 0x41, 0x2d, 0x58, 0xc6);
|
|
SINGLE_COMPARE(vaddpd(ymm8, ymm10, ymmword[r9]), 0xc4, 0x41, 0x2d, 0x58, 0x01);
|
|
SINGLE_COMPARE(vaddps(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x28, 0x58, 0xc6);
|
|
SINGLE_COMPARE(vaddps(xmm8, xmm10, xmmword[r9]), 0xc4, 0x41, 0x28, 0x58, 0x01);
|
|
SINGLE_COMPARE(vaddsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0x58, 0xc6);
|
|
SINGLE_COMPARE(vaddsd(xmm8, xmm10, qword[r9]), 0xc4, 0x41, 0x2b, 0x58, 0x01);
|
|
SINGLE_COMPARE(vaddss(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2a, 0x58, 0xc6);
|
|
SINGLE_COMPARE(vaddss(xmm8, xmm10, dword[r9]), 0xc4, 0x41, 0x2a, 0x58, 0x01);
|
|
|
|
SINGLE_COMPARE(vaddps(xmm1, xmm2, xmm3), 0xc4, 0xe1, 0x68, 0x58, 0xcb);
|
|
SINGLE_COMPARE(vaddps(xmm9, xmm12, xmmword[r9 + r14 * 2 + 0x1c]), 0xc4, 0x01, 0x18, 0x58, 0x4c, 0x71, 0x1c);
|
|
SINGLE_COMPARE(vaddps(ymm1, ymm2, ymm3), 0xc4, 0xe1, 0x6c, 0x58, 0xcb);
|
|
SINGLE_COMPARE(vaddps(ymm9, ymm12, ymmword[r9 + r14 * 2 + 0x1c]), 0xc4, 0x01, 0x1c, 0x58, 0x4c, 0x71, 0x1c);
|
|
|
|
// Coverage for other instructions that follow the same pattern
|
|
SINGLE_COMPARE(vsubsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0x5c, 0xc6);
|
|
SINGLE_COMPARE(vmulsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0x59, 0xc6);
|
|
SINGLE_COMPARE(vdivsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0x5e, 0xc6);
|
|
|
|
SINGLE_COMPARE(vsubps(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x28, 0x5c, 0xc6);
|
|
SINGLE_COMPARE(vmulps(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x28, 0x59, 0xc6);
|
|
SINGLE_COMPARE(vdivps(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x28, 0x5e, 0xc6);
|
|
|
|
SINGLE_COMPARE(vorpd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x29, 0x56, 0xc6);
|
|
SINGLE_COMPARE(vxorpd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x29, 0x57, 0xc6);
|
|
SINGLE_COMPARE(vorps(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x28, 0x56, 0xc6);
|
|
|
|
SINGLE_COMPARE(vandpd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x29, 0x54, 0xc6);
|
|
SINGLE_COMPARE(vandnpd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x29, 0x55, 0xc6);
|
|
|
|
SINGLE_COMPARE(vmaxsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0x5f, 0xc6);
|
|
SINGLE_COMPARE(vminsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0x5d, 0xc6);
|
|
|
|
SINGLE_COMPARE(vcmpltsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0xc2, 0xc6, 0x01);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "AVXUnaryMergeInstructionForms")
|
|
{
|
|
SINGLE_COMPARE(vsqrtpd(xmm8, xmm10), 0xc4, 0x41, 0x79, 0x51, 0xc2);
|
|
SINGLE_COMPARE(vsqrtpd(xmm8, xmmword[r9]), 0xc4, 0x41, 0x79, 0x51, 0x01);
|
|
SINGLE_COMPARE(vsqrtpd(ymm8, ymm10), 0xc4, 0x41, 0x7d, 0x51, 0xc2);
|
|
SINGLE_COMPARE(vsqrtpd(ymm8, ymmword[r9]), 0xc4, 0x41, 0x7d, 0x51, 0x01);
|
|
SINGLE_COMPARE(vsqrtps(xmm8, xmm10), 0xc4, 0x41, 0x78, 0x51, 0xc2);
|
|
SINGLE_COMPARE(vsqrtps(xmm8, xmmword[r9]), 0xc4, 0x41, 0x78, 0x51, 0x01);
|
|
SINGLE_COMPARE(vsqrtsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0x51, 0xc6);
|
|
SINGLE_COMPARE(vsqrtsd(xmm8, xmm10, qword[r9]), 0xc4, 0x41, 0x2b, 0x51, 0x01);
|
|
SINGLE_COMPARE(vsqrtss(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2a, 0x51, 0xc6);
|
|
SINGLE_COMPARE(vsqrtss(xmm8, xmm10, dword[r9]), 0xc4, 0x41, 0x2a, 0x51, 0x01);
|
|
|
|
// Coverage for other instructions that follow the same pattern
|
|
SINGLE_COMPARE(vucomisd(xmm1, xmm4), 0xc4, 0xe1, 0x79, 0x2e, 0xcc);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "AVXMoveInstructionForms")
|
|
{
|
|
SINGLE_COMPARE(vmovsd(qword[r9], xmm10), 0xc4, 0x41, 0x7b, 0x11, 0x11);
|
|
SINGLE_COMPARE(vmovsd(xmm8, qword[r9]), 0xc4, 0x41, 0x7b, 0x10, 0x01);
|
|
SINGLE_COMPARE(vmovsd(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2b, 0x10, 0xc6);
|
|
SINGLE_COMPARE(vmovss(dword[r9], xmm10), 0xc4, 0x41, 0x7a, 0x11, 0x11);
|
|
SINGLE_COMPARE(vmovss(xmm8, dword[r9]), 0xc4, 0x41, 0x7a, 0x10, 0x01);
|
|
SINGLE_COMPARE(vmovss(xmm8, xmm10, xmm14), 0xc4, 0x41, 0x2a, 0x10, 0xc6);
|
|
SINGLE_COMPARE(vmovapd(xmm8, xmmword[r9]), 0xc4, 0x41, 0x79, 0x28, 0x01);
|
|
SINGLE_COMPARE(vmovapd(xmmword[r9], xmm10), 0xc4, 0x41, 0x79, 0x29, 0x11);
|
|
SINGLE_COMPARE(vmovapd(ymm8, ymmword[r9]), 0xc4, 0x41, 0x7d, 0x28, 0x01);
|
|
SINGLE_COMPARE(vmovaps(xmm8, xmmword[r9]), 0xc4, 0x41, 0x78, 0x28, 0x01);
|
|
SINGLE_COMPARE(vmovaps(xmmword[r9], xmm10), 0xc4, 0x41, 0x78, 0x29, 0x11);
|
|
SINGLE_COMPARE(vmovaps(ymm8, ymmword[r9]), 0xc4, 0x41, 0x7c, 0x28, 0x01);
|
|
SINGLE_COMPARE(vmovupd(xmm8, xmmword[r9]), 0xc4, 0x41, 0x79, 0x10, 0x01);
|
|
SINGLE_COMPARE(vmovupd(xmmword[r9], xmm10), 0xc4, 0x41, 0x79, 0x11, 0x11);
|
|
SINGLE_COMPARE(vmovupd(ymm8, ymmword[r9]), 0xc4, 0x41, 0x7d, 0x10, 0x01);
|
|
SINGLE_COMPARE(vmovups(xmm8, xmmword[r9]), 0xc4, 0x41, 0x78, 0x10, 0x01);
|
|
SINGLE_COMPARE(vmovups(xmmword[r9], xmm10), 0xc4, 0x41, 0x78, 0x11, 0x11);
|
|
SINGLE_COMPARE(vmovups(ymm8, ymmword[r9]), 0xc4, 0x41, 0x7c, 0x10, 0x01);
|
|
SINGLE_COMPARE(vmovq(xmm1, rbx), 0xc4, 0xe1, 0xf9, 0x6e, 0xcb);
|
|
SINGLE_COMPARE(vmovq(rbx, xmm1), 0xc4, 0xe1, 0xf9, 0x7e, 0xcb);
|
|
SINGLE_COMPARE(vmovq(xmm1, qword[r9]), 0xc4, 0xc1, 0xf9, 0x6e, 0x09);
|
|
SINGLE_COMPARE(vmovq(qword[r9], xmm1), 0xc4, 0xc1, 0xf9, 0x7e, 0x09);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "AVXConversionInstructionForms")
|
|
{
|
|
SINGLE_COMPARE(vcvttsd2si(ecx, xmm0), 0xc4, 0xe1, 0x7b, 0x2c, 0xc8);
|
|
SINGLE_COMPARE(vcvttsd2si(r9d, xmmword[rcx + rdx]), 0xc4, 0x61, 0x7b, 0x2c, 0x0c, 0x11);
|
|
SINGLE_COMPARE(vcvttsd2si(rdx, xmm0), 0xc4, 0xe1, 0xfb, 0x2c, 0xd0);
|
|
SINGLE_COMPARE(vcvttsd2si(r13, xmmword[rcx + rdx]), 0xc4, 0x61, 0xfb, 0x2c, 0x2c, 0x11);
|
|
SINGLE_COMPARE(vcvtsi2sd(xmm5, xmm10, ecx), 0xc4, 0xe1, 0x2b, 0x2a, 0xe9);
|
|
SINGLE_COMPARE(vcvtsi2sd(xmm6, xmm11, dword[rcx + rdx]), 0xc4, 0xe1, 0x23, 0x2a, 0x34, 0x11);
|
|
SINGLE_COMPARE(vcvtsi2sd(xmm5, xmm10, r13), 0xc4, 0xc1, 0xab, 0x2a, 0xed);
|
|
SINGLE_COMPARE(vcvtsi2sd(xmm6, xmm11, qword[rcx + rdx]), 0xc4, 0xe1, 0xa3, 0x2a, 0x34, 0x11);
|
|
SINGLE_COMPARE(vcvtsd2ss(xmm5, xmm10, xmm11), 0xc4, 0xc1, 0x2b, 0x5a, 0xeb);
|
|
SINGLE_COMPARE(vcvtsd2ss(xmm6, xmm11, qword[rcx + rdx]), 0xc4, 0xe1, 0xa3, 0x5a, 0x34, 0x11);
|
|
SINGLE_COMPARE(vcvtss2sd(xmm3, xmm8, xmm12), 0xc4, 0xc1, 0x3a, 0x5a, 0xdc);
|
|
SINGLE_COMPARE(vcvtss2sd(xmm4, xmm9, dword[rcx + rsi]), 0xc4, 0xe1, 0x32, 0x5a, 0x24, 0x31);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "AVXTernaryInstructionForms")
|
|
{
|
|
SINGLE_COMPARE(vroundsd(xmm7, xmm12, xmm3, RoundingModeX64::RoundToNegativeInfinity), 0xc4, 0xe3, 0x19, 0x0b, 0xfb, 0x09);
|
|
SINGLE_COMPARE(
|
|
vroundsd(xmm8, xmm13, xmmword[r13 + rdx], RoundingModeX64::RoundToPositiveInfinity), 0xc4, 0x43, 0x11, 0x0b, 0x44, 0x15, 0x00, 0x0a
|
|
);
|
|
SINGLE_COMPARE(vroundsd(xmm9, xmm14, xmmword[rcx + r10], RoundingModeX64::RoundToZero), 0xc4, 0x23, 0x09, 0x0b, 0x0c, 0x11, 0x0b);
|
|
SINGLE_COMPARE(vblendvpd(xmm7, xmm12, xmmword[rcx + r10], xmm5), 0xc4, 0xa3, 0x19, 0x4b, 0x3c, 0x11, 0x50);
|
|
|
|
SINGLE_COMPARE(vpshufps(xmm7, xmm12, xmmword[rcx + r10], 0b11010100), 0xc4, 0xa1, 0x18, 0xc6, 0x3c, 0x11, 0xd4);
|
|
SINGLE_COMPARE(vpinsrd(xmm7, xmm12, xmmword[rcx + r10], 2), 0xc4, 0xa3, 0x19, 0x22, 0x3c, 0x11, 0x02);
|
|
|
|
SINGLE_COMPARE(vdpps(xmm7, xmm12, xmmword[rcx + r10], 2), 0xc4, 0xa3, 0x19, 0x40, 0x3c, 0x11, 0x02);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "MiscInstructions")
|
|
{
|
|
SINGLE_COMPARE(int3(), 0xcc);
|
|
SINGLE_COMPARE(ud2(), 0x0f, 0x0b);
|
|
SINGLE_COMPARE(bsr(eax, edx), 0x0f, 0xbd, 0xc2);
|
|
SINGLE_COMPARE(bsf(eax, edx), 0x0f, 0xbc, 0xc2);
|
|
SINGLE_COMPARE(bswap(eax), 0x0f, 0xc8);
|
|
SINGLE_COMPARE(bswap(r12d), 0x41, 0x0f, 0xcc);
|
|
SINGLE_COMPARE(bswap(rax), 0x48, 0x0f, 0xc8);
|
|
SINGLE_COMPARE(bswap(r12), 0x49, 0x0f, 0xcc);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "LabelLea")
|
|
{
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build)
|
|
{
|
|
Label fn;
|
|
build.lea(rax, fn);
|
|
build.ret();
|
|
|
|
build.setLabel(fn);
|
|
build.ret();
|
|
},
|
|
{0x48, 0x8d, 0x05, 0x01, 0x00, 0x00, 0x00, 0xc3, 0xc3}
|
|
));
|
|
}
|
|
|
|
TEST_CASE("LogTest")
|
|
{
|
|
AssemblyBuilderX64 build(/* logText= */ true);
|
|
|
|
build.push(r12);
|
|
build.align(8);
|
|
build.align(8, AlignmentDataX64::Int3);
|
|
build.align(8, AlignmentDataX64::Ud2);
|
|
|
|
build.add(rax, rdi);
|
|
build.add(rcx, 8);
|
|
build.sub(dword[rax], 0x1fdc);
|
|
build.and_(dword[rcx], 0x37);
|
|
build.mov(rdi, qword[rax + rsi * 2]);
|
|
build.vaddss(xmm0, xmm0, dword[rax + r14 * 2 + 0x1c]);
|
|
|
|
Label start = build.setLabel();
|
|
build.cmp(rsi, rdi);
|
|
build.jcc(ConditionX64::Equal, start);
|
|
build.lea(rcx, start);
|
|
build.lea(rcx, addr[rdx]);
|
|
|
|
build.jmp(qword[rdx]);
|
|
build.vaddps(ymm9, ymm12, ymmword[rbp + 0xc]);
|
|
build.vaddpd(ymm2, ymm7, build.f64(2.5));
|
|
build.neg(qword[rbp + r12 * 2]);
|
|
build.mov64(r10, 0x1234567812345678ll);
|
|
build.vmovapd(xmmword[rax], xmm11);
|
|
build.movzx(eax, byte[rcx]);
|
|
build.movsx(rsi, word[r12]);
|
|
build.imul(rcx, rdx);
|
|
build.imul(rcx, rdx, 8);
|
|
build.vroundsd(xmm1, xmm2, xmm3, RoundingModeX64::RoundToNearestEven);
|
|
build.add(rdx, qword[rcx - 12]);
|
|
build.pop(r12);
|
|
build.cmov(ConditionX64::AboveEqual, rax, rbx);
|
|
build.ret();
|
|
build.int3();
|
|
|
|
build.nop();
|
|
build.nop(2);
|
|
build.nop(3);
|
|
build.nop(4);
|
|
build.nop(5);
|
|
build.nop(6);
|
|
build.nop(7);
|
|
build.nop(8);
|
|
build.nop(9);
|
|
|
|
build.finalize();
|
|
|
|
std::string expected = R"(
|
|
push r12
|
|
; align 8
|
|
nop word ptr[rax+rax] ; 6-byte nop
|
|
; align 8 using int3
|
|
; align 8 using ud2
|
|
add rax,rdi
|
|
add rcx,8
|
|
sub dword ptr [rax],1FDCh
|
|
and dword ptr [rcx],37h
|
|
mov rdi,qword ptr [rax+rsi*2]
|
|
vaddss xmm0,xmm0,dword ptr [rax+r14*2+01Ch]
|
|
.L1:
|
|
cmp rsi,rdi
|
|
je .L1
|
|
lea rcx,.L1
|
|
lea rcx,[rdx]
|
|
jmp qword ptr [rdx]
|
|
vaddps ymm9,ymm12,ymmword ptr [rbp+0Ch]
|
|
vaddpd ymm2,ymm7,qword ptr [.start-8]
|
|
neg qword ptr [rbp+r12*2]
|
|
mov r10,1234567812345678h
|
|
vmovapd xmmword ptr [rax],xmm11
|
|
movzx eax,byte ptr [rcx]
|
|
movsx rsi,word ptr [r12]
|
|
imul rcx,rdx
|
|
imul rcx,rdx,8
|
|
vroundsd xmm1,xmm2,xmm3,8
|
|
add rdx,qword ptr [rcx-0Ch]
|
|
pop r12
|
|
cmovae rax,rbx
|
|
ret
|
|
int3
|
|
nop
|
|
xchg ax, ax ; 2-byte nop
|
|
nop dword ptr[rax] ; 3-byte nop
|
|
nop dword ptr[rax] ; 4-byte nop
|
|
nop dword ptr[rax+rax] ; 5-byte nop
|
|
nop word ptr[rax+rax] ; 6-byte nop
|
|
nop dword ptr[rax] ; 7-byte nop
|
|
nop dword ptr[rax+rax] ; 8-byte nop
|
|
nop word ptr[rax+rax] ; 9-byte nop
|
|
)";
|
|
|
|
CHECK("\n" + build.text == expected);
|
|
}
|
|
|
|
TEST_CASE_FIXTURE(AssemblyBuilderX64Fixture, "Constants")
|
|
{
|
|
// clang-format off
|
|
CHECK(check(
|
|
[](AssemblyBuilderX64& build) {
|
|
build.xor_(rax, rax);
|
|
build.add(rax, build.i64(0x1234567887654321));
|
|
build.vmovss(xmm2, build.f32(1.0f));
|
|
build.vmovsd(xmm3, build.f64(1.0));
|
|
build.vmovaps(xmm4, build.f32x4(1.0f, 2.0f, 4.0f, 8.0f));
|
|
char arr[16] = "hello world!123";
|
|
build.vmovupd(xmm5, build.bytes(arr, 16, 8));
|
|
build.vmovapd(xmm5, build.f64x2(5.0, 6.0));
|
|
build.ret();
|
|
},
|
|
{
|
|
0x48, 0x33, 0xc0,
|
|
0x48, 0x03, 0x05, 0xee, 0xff, 0xff, 0xff,
|
|
0xc4, 0xe1, 0x7a, 0x10, 0x15, 0xe1, 0xff, 0xff, 0xff,
|
|
0xc4, 0xe1, 0x7b, 0x10, 0x1d, 0xcc, 0xff, 0xff, 0xff,
|
|
0xc4, 0xe1, 0x78, 0x28, 0x25, 0xab, 0xff, 0xff, 0xff,
|
|
0xc4, 0xe1, 0x79, 0x10, 0x2d, 0x92, 0xff, 0xff, 0xff,
|
|
0xc4, 0xe1, 0x79, 0x28, 0x2d, 0x79, 0xff, 0xff, 0xff,
|
|
0xc3
|
|
},
|
|
{
|
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, 0x40,
|
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x40,
|
|
'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', '1', '2', '3', 0x0,
|
|
0x00, 0x00, 0x80, 0x3f,
|
|
0x00, 0x00, 0x00, 0x40,
|
|
0x00, 0x00, 0x80, 0x40,
|
|
0x00, 0x00, 0x00, 0x41,
|
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // padding to align f32x4
|
|
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f,
|
|
0x00, 0x00, 0x00, 0x00, // padding to align f64
|
|
0x00, 0x00, 0x80, 0x3f,
|
|
0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12,
|
|
}));
|
|
// clang-format on
|
|
}
|
|
|
|
TEST_CASE("ConstantStorage")
|
|
{
|
|
AssemblyBuilderX64 build(/* logText= */ false);
|
|
|
|
for (int i = 0; i <= 3000; i++)
|
|
build.vaddss(xmm0, xmm0, build.i32(i));
|
|
|
|
build.finalize();
|
|
|
|
CHECK(build.data.size() == 12004);
|
|
|
|
for (int i = 0; i <= 3000; i++)
|
|
{
|
|
CHECK(build.data[i * 4 + 0] == ((3000 - i) & 0xff));
|
|
CHECK(build.data[i * 4 + 1] == ((3000 - i) >> 8));
|
|
CHECK(build.data[i * 4 + 2] == 0x00);
|
|
CHECK(build.data[i * 4 + 3] == 0x00);
|
|
}
|
|
}
|
|
|
|
TEST_CASE("ConstantStorageDedup")
|
|
{
|
|
AssemblyBuilderX64 build(/* logText= */ false);
|
|
|
|
for (int i = 0; i <= 3000; i++)
|
|
build.vaddss(xmm0, xmm0, build.f32(1.0f));
|
|
|
|
build.finalize();
|
|
|
|
CHECK(build.data.size() == 4);
|
|
|
|
CHECK(build.data[0] == 0x00);
|
|
CHECK(build.data[1] == 0x00);
|
|
CHECK(build.data[2] == 0x80);
|
|
CHECK(build.data[3] == 0x3f);
|
|
}
|
|
|
|
TEST_CASE("ConstantCaching")
|
|
{
|
|
AssemblyBuilderX64 build(/* logText= */ false);
|
|
|
|
OperandX64 two = build.f64(2);
|
|
|
|
// Force data relocation
|
|
for (int i = 0; i < 4096; i++)
|
|
build.f64(i);
|
|
|
|
CHECK(build.f64(2).imm == two.imm);
|
|
|
|
build.finalize();
|
|
}
|
|
|
|
TEST_SUITE_END();
|