x86-64框架是一种兼容x86指令集的64位微处理器架构。
从逆向工程的角度来说,这种框架的主要区别在于:
除了位于FPU和SIMD的寄存器之外,几乎所有的寄存器都变为64位寄存器。所有指令都可以通过R-字头的助记符(寄存器名称)调用64位寄存器。而且x86-64框架的CPU要比x86框架的CPU多出8个通用寄存器。这些通用寄存器分别是:RAX、RBX、RCX、RDX、RBP、RSP、RSI、RDI、R8、R9、R10、R11、R12、R13、R14、R15。
它向下兼容,允许程序使用GPR的32位寄存器的助记符、操作该寄存器的低32位数据。例如说,程序可以通过助记符EAX访问RAX寄存器的低32位。
7th(字节号)
|
6th
|
5th
|
4th
|
3rd
|
2nd
|
1st
|
0th
|
RAXx64
|
|||||||
EAX
|
|||||||
AX
|
|||||||
AH
|
AL
|
64位寄存器特有的R8~R15寄存器不仅有相应的(低)32位助记符,即R8D~R15D(低32位),而且还有相应的(低)16位助记符R8W~R15W。
7th(字节号)
|
6th
|
5th
|
4th
|
3rd
|
2nd
|
1st
|
0th
|
R8
|
|||||||
R8D
|
|||||||
R8W
|
|||||||
R8L
|
x86-64框架的CPU有16个SIMD寄存器,即XMM0~XMM15。它的SIMD寄存器比x86 CPU多出一倍。
Win64系统的函数调用约定发生了相应的变化,采取了与fastcall规范(请参见本书64.3节)相似的参数传递规范。这类程序使用RCX、RDX、R8、R9寄存器传递函数所需的前4个参数,并用栈传递其余参数。为了保证被调用方函数能够使用前4个参数所占的寄存器,调用方函数会单独分配出来32字节的栈空间,以便被调用方函数保存4个参数寄存器的原始状态。虽然小型函数可能不会用到多少寄存器,但是大型函数可能就要把传入的参数保存到栈里,以尽量充分使用寄存器资源。
System V AMD64 ABI(泛指Linux、各种BSD和Mac OS X)(参见Mit13)系统的函数调用约定也与fastcall规范相似。它使用RDI、RSI、RDX、RCX、R8、R9这6个寄存器传递前6个参数,再使用栈来传递其余参数。
本书的第64章详细介绍了各种调用约定。
在C/C++语言里,x86-64平台的int型数据仍然是32位数据。
x86-64平台的指针都是64位指针。
不过实际情况并非那么理想,x64 CPU在外部RAM的寻址能力只有48位。但是存储指针、包括缓存区的开销却高出x86一倍。
因为64位平台的寄存器数量是32位平台的两倍,编译器在调动资源方面的底气也更足一些,所以它们的寄存器分配方案(register allocation)当然会不同。这就是说,64位应用程序中的局部变量(栈空间)会更少一些。
第25章介绍过一个用bitslice DES算法实现并行计算的源程序。我们对其稍加修改,让程序根据DES_type数据类型(uint32、uint64、SSE2或AVX)处理32/64/128/256位数据的S-Box。
/*
* Generated S-box files.
*
* This software may be modified, redistributed, and used for any purpose,
* so long as its origin is acknowledged.
*
* Produced by Matthew Kwan - March 1998
*/
#ifdef _WIN64
#define DES_type unsigned __int64
#else
#define DES_type unsigned int
#endif
void
s1 (
DES_type a1,
DES_type a2,
DES_type a3,
DES_type a4,
DES_type a5,
DES_type a6,
DES_type *out1,
DES_type *out2,
DES_type *out3,
DES_type *out4
){
DES_type x1, x2, x3, x4, x5, x6, x7, x8;
DES_type x9, x10, x11, x12, x13, x14, x15, x16;
DES_type x17, x18, x19, x20, x21, x22, x23, x24;
DES_type x25, x26, x27, x28, x29, x30, x31, x32;
DES_type x33, x34, x35, x36, x37, x38, x39, x40;
DES_type x41, x42, x43, x44, x45, x46, x47, x48;
DES_type x49, x50, x51, x52, x53, x54, x55, x56;
x1 = a3 & ~a5;
x2 = x1 ^ a4;
x3 = a3 & ~a4;
x4 = x3 | a5;
x5 = a6 & x4;
x6 = x2 ^ x5;
x7 = a4 & ~a5;
x8 = a3 ^ a4;
x9 = a6 & ~x8;
x10 = x7 ^ x9;
x11 = a2 | x10;
x12 = x6 ^ x11;
x13 = a5 ^ x5;
x14 = x13 & x8;
x15 = a5 & ~a4;
x16 = x3 ^ x14;
x17 = a6 | x16;
x18 = x15 ^ x17;
x19 = a2 | x18;
x20 = x14 ^ x19;
x21 = a1 & x20;
x22 = x12 ^ ~x21;
*out2 ^= x22;
x23 = x1 | x5;
x24 = x23 ^ x8;
x25 = x18 & ~x2;
x26 = a2 & ~x25;
x27 = x24 ^ x26;
x28 = x6 | x7;
x29 = x28 ^ x25;
x30 = x9 ^ x24;
x31 = x18 & ~x30;
x32 = a2 & x31;
x33 = x29 ^ x32;
x34 = a1 & x33;
x35 = x27 ^ x34;
*out4 ^= x35;
x36 = a3 & x28;
x37 = x18 & ~x36;
x38 = a2 | x3;
x39 = x37 ^ x38;
x40 = a3 | x31;
x41 = x24 & ~x37;
x42 = x41 | x3;
x43 = x42 & ~a2;
x44 = x40 ^ x43;
x45 = a1 & ~x44;
x46 = x39 ^ ~x45;
*out1 ^= x46;
x47 = x33 & ~x9;
x48 = x47 ^ x39;
x49 = x4 ^ x36;
x50 = x49 & ~x5;
x51 = x42 | x18;
x52 = x51 ^ a5;
x53 = a2 & ~x52;
x54 = x50 ^ x53;
x55 = a1 | x54;
x56 = x48 ^ ~x55;
*out3 ^= x56;
}
这个源程序里有很多局部变量。当然并非所有的局部变量都要存储到栈里。我们使用32位的MSVC 2008(启用/Ox选项)编译它,可得到如下所示的代码。
指令清单26.1 Optimizing MSVC 2008
PUBLIC _s1
; Function compile flags: /Ogtpy
_TEXT SEGMENT
_x6$ = -20 ; size = 4
_x3$ = -16 ; size = 4
_x1$ = -12 ; size = 4
_x8$ = -8 ; size = 4
_x4$ = -4 ; size = 4
_a1$ = 8 ; size = 4
_a2$ = 12 ; size = 4
_a3$ = 16 ; size = 4
_x33$ = 20 ; size = 4
_x7$ = 20 ; size = 4
_a4$ = 20 ; size = 4
_a5$ = 24 ; size = 4
tv326 = 28 ; size = 4
_x36$ = 28 ; size = 4
_x28$ = 28 ; size = 4
_a6$ = 28 ; size = 4
_out1$ = 32 ; size = 4
_x24$ = 36 ; size = 4
_out2$ = 36 ; size = 4
_out3$ = 40 ; size = 4
_out4$ = 44 ; size = 4
_s1 PROC
sub esp, 20 ; 00000014H
mov edx, DWORD PTR _a5$[esp+16]
push ebx
mov ebx, DWORD PTR _a4$[esp+20]
push ebp
push esi
mov esi, DWORD PTR _a3$[esp+28]
push edi
mov edi, ebx
not edi
mov ebp, edi
and edi, DWORD PTR _a5$[esp+32]
mov ecx, edx
not ecx
and ebp, esi
mov eax, ecx
and eax, esi
and ecx, ebx
mov DWORD PTR _x1$[esp+36], eax
xor eax, ebx
mov esi, ebp
or esi, edx
mov DWORD PTR _x4$[esp+36], esi
and esi, DWORD PTR _a6$[esp+32]
mov DWORD PTR _x7$[esp+32], ecx
mov edx, esi
xor edx, eax
mov DWORD PTR _x6$[esp+36], edx
mov edx, DWORD PTR _a3$[esp+32]
xor edx, ebx
mov ebx, esi
xor ebx, DWORD PTR _a5$[esp+32]
mov DWORD PTR _x8$[esp+36], edx
and ebx, edx
mov ecx, edx
mov edx, ebx
xor edx, ebp
or edx, DWORD PTR _a6$[esp+32]
not ecx
and ecx, DWORD PTR _a6$[esp+32]
xor edx, edi
mov edi, edx
or edi, DWORD PTR _a2$[esp+32]
mov DWORD PTR _x3$[esp+36], ebp
mov ebp, DWORD PTR _a2$[esp+32]
xor edi, ebx
and edi, DWORD PTR _a1$[esp+32]
mov ebx, ecx
xor ebx, DWORD PTR _x7$[esp+32]
not edi
or ebx, ebp
xor edi, ebx
mov ebx, edi
mov edi, DWORD PTR _out2$[esp+32]
xor ebx, DWORD PTR [edi]
not eax
xor ebx, DWORD PTR _x6$[esp+36]
and eax, edx
mov DWORD PTR [edi], ebx
mov ebx, DWORD PTR _x7$[esp+32]
or ebx, DWORD PTR _x6$[esp+36]
mov edi, esi
or edi, DWORD PTR _x1$[esp+36]
mov DWORD PTR _x28$[esp+32], ebx
xor edi, DWORD PTR _x8$[esp+36]
mov DWORD PTR _x24$[esp+32], edi
xor edi, ecx
not edi
and edi, edx
mov ebx, edi
and ebx, ebp
xor ebx, DWORD PTR _x28$[esp+32]
xor ebx, eax
not eax
mov DWORD PTR _x33$[esp+32], ebx
and ebx, DWORD PTR _a1$[esp+32]
and eax, ebp
xor eax, ebx
mov ebx, DWORD PTR _out4$[esp+32]
xor eax, DWORD PTR [ebx]
xor eax, DWORD PTR _x24$[esp+32]
mov DWORD PTR [ebx], eax
mov eax, DWORD PTR _x28$[esp+32]
and eax, DWORD PTR _a3$[esp+32]
mov ebx, DWORD PTR _x3$[esp+36]
or edi, DWORD PTR _a3$[esp+32]
mov DWORD PTR _x36$[esp+32], eax
not eax
and eax, edx
or ebx, ebp
xor ebx, eax
not eax
and eax, DWORD PTR _x24$[esp+32]
not ebp
or eax, DWORD PTR _x3$[esp+36]
not esi
and ebp, eax
or eax, edx
xor eax, DWORD PTR _a5$[esp+32]
mov edx, DWORD PTR _x36$[esp+32]
xor edx, DWORD PTR _x4$[esp+36]
xor ebp, edi
mov edi, DWORD PTR _out1$[esp+32]
not eax
and eax, DWORD PTR _a2$[esp+32]
not ebp
and ebp, DWORD PTR _a1$[esp+32]
and edx, esi
xor eax, edx
or eax, DWORD PTR _a1$[esp+32]
not ebp
xor ebp, DWORD PTR [edi]
not ecx
and ecx, DWORD PTR _x33$[esp+32]
xor ebp, ebx
not eax
mov DWORD PTR [edi], ebp
xor eax, ecx
mov ecx, DWORD PTR _out3$[esp+32]
xor eax, DWORD PTR [ecx]
pop edi
pop esi
xor eax, ebx
pop ebp
mov DWORD PTR [ecx], eax
pop ebx
add esp, 20 ; 00000014H
ret 0
_s1 ENDP
编译器只在栈里存储了5个变量。
作为对比,我们再使用64位的MSVC 2008编译它,得到的代码如下所示。
指令清单26.2 Optimizing MSVC 2008
a1$ = 56
a2$ = 64
a3$ = 72
a4$ = 80
x36$1$ = 88
a5$ = 88
a6$ = 96
out1$ = 104
out2$ = 112
out3$ = 120
out4$ = 128
s1 PROC
$LN3:
mov QWORD PTR [rsp+24], rbx
mov QWORD PTR [rsp+32], rbp
mov QWORD PTR [rsp+16], rdx
mov QWORD PTR [rsp+8], rcx
push rsi
push rdi
push r12
push r13
push r14
push r15
mov r15, QWORD PTR a5$[rsp]
mov rcx, QWORD PTR a6$[rsp]
mov rbp, r8
mov r10, r9
mov rax, r15
mov rdx, rbp
not rax
xor rdx, r9
not r10
mov r11, rax
and rax, r9
mov rsi, r10
mov QWORD PTR x36$1$[rsp], rax
and r11, r8
and rsi, r8
and r10, r15
mov r13, rdx
mov rbx, r11
xor rbx, r9
mov r9, QWORD PTR a2$[rsp]
mov r12, rsi
or r12, r15
not r13
and r13, rcx
mov r14, r12
and r14, rcx
mov rax, r14
mov r8, r14
xor r8, rbx
xor rax, r15
not rbx
and rax, rdx
mov rdi, rax
xor rdi, rsi
or rdi, rcx
xor rdi, r10
and rbx, rdi
mov rcx, rdi
or rcx, r9
xor rcx, rax
mov rax, r13
xor rax, QWORD PTR x36$1$[rsp]
and rcx, QWORD PTR a1$[rsp]
or rax, r9
not rcx
xor rcx, rax
mov rax, QWORD PTR out2$[rsp]
xor rcx, QWORD PTR [rax]
xor rcx, r8
mov QWORD PTR [rax], rcx
mov rax, QWORD PTR x36$1$[rsp]
mov rcx, r14
or rax, r8
or rcx, r11
mov r11, r9
xor rcx, rdx
mov QWORD PTR x36$1$[rsp], rax
mov r8, rsi
mov rdx, rcx
xor rdx, r13
not rdx
and rdx, rdi
mov r10, rdx
and r10, r9
xor r10, rax
xor r10, rbx
not rbx
and rbx, r9
mov rax, r10
and rax, QWORD PTR a1$[rsp]
xor rbx, rax
mov rax, QWORD PTR out4$[rsp]
xor rbx, QWORD PTR [rax]
xor rbx, rcx
mov QWORD PTR [rax], rbx
mov rbx, QWORD PTR x36$1$[rsp]
and rbx, rbp
mov r9, rbx
not r9
and r9, rdi
or r8, r11
mov rax, QWORD PTR out1$[rsp]
xor r8, r9
not r9
and r9, rcx
or rdx, rbp
mov rbp, QWORD PTR [rsp+80]
or r9, rsi
xor rbx, r12
mov rcx, r11
not rcx
not r14
not r13
and rcx, r9
or r9, rdi
and rbx, r14
xor r9, r15
xor rcx, rdx
mov rdx, QWORD PTR a1$[rsp]
not r9
not rcx
and r13, r10
and r9, r11
and rcx, rdx
xor r9, rbx
mov rbx, QWORD PTR [rsp+72]
not rcx
xor rcx, QWORD PTR [rax]
or r9, rdx
not r9
xor rcx, r8
mov QWORD PTR [rax], rcx
mov rax, QWORD PTR out3$[rsp]
xor r9, r13
xor r9, QWORD PTR [rax]
xor r9, r8
mov QWORD PTR [rax], r9
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
ret 0
s1 ENDP
编译器根本没有使用栈。上述代码中的x36就是a5。
顺便说一下,x86-64 CPU的GPR还算不上是最多的。例如,Itanium/安腾处理器就有128个GPR。
自ARM v8开始,ARM处理器支持64位指令。
有关64位平台的浮点数运算,请参见本书第27章。