algorithm - 解析 x86 机器代码指令的过程
问题描述
我一直在研究反汇编器,以及如何从机器代码还原为汇编(或者从机器代码还原为某种可以在 VM 中求值的中间形式)。这让我找到了 xed,这是一个不错的项目,但它非常复杂且难以理解。我找到了我正在寻找的粗略代码,它基本上可以归结为:
// Pseudo-code sketch of a top-level decode routine: it runs six scanner
// stages in a fixed order, passing the same `decoder` object to each one.
// NOTE(review): judging by the stage names only, the stages presumably cover
// prefixes, opcode, ModRM, SIB, displacement and immediate - confirm against
// the project this sketch was taken from.
decode(decoder) {
prefix_scanner(decoder)
opcode_scanner(decoder)
modrm_scanner(decoder)
sib_scanner(decoder)
disp_scanner(decoder)
imm_scanner(decoder)
}
这暗示了将指令输入位解析为某种结构或对象的基本过程。
这个项目甚至可能更好,但它不太活跃,尽管它自称全面而完整。其中有这样一个函数(代码的实现方式很独特,一切都是全局变量......看起来更像是一个演示而不是一个模块):
/**
 * Decode one instruction and return it as text.
 *
 * Takes no parameters: every input and intermediate value (Opcode, Instruction,
 * InsOperands, PrefixG1, PrefixG2, InvalidOp, VectorRegister, WidthBit, ...) is
 * declared outside this function and is read and written in place. Reset() is
 * called first; exactly what it clears is not visible from here.
 *
 * Order of work: record the start position, decode prefixes, decode the opcode,
 * patch the mnemonic for opcodes >= 0x700, set up and validate the operand
 * layout, decode operands, resolve opcode 0x10F / "SSS" mnemonics from the
 * following byte(s), rewrite prefixes (XRELEASE/XACQUIRE, HT/HNT, BND), enforce
 * the 15-byte length limit, then assemble the output string.
 *
 * @returns {string} "???" when InvalidOp is set after opcode/operand setup;
 *   otherwise "PrefixG1 PrefixG2 Instruction InsOperands" with leading and
 *   trailing spaces stripped, followed by RoundModes[RoundMode] when
 *   Opcode >= 0x700 or RoundMode !== 0. Instruction itself may still be "???"
 *   in that second form (unknown 0x10F / "SSS" code, or more than 15 bytes).
 */
function DecodeInstruction()
{
//Reset Prefix adjustments, and vector setting adjustments.
Reset();
var out = ""; //The instruction text that this function returns.
//Record the starting position.
InstructionPos = GetPosition();
//First read any opcodes (prefix) that act as adjustments to the main three operand decode functions ^DecodeRegValue()^,
//^Decode_ModRM_SIB_Address()^, and ^DecodeImmediate()^.
DecodePrefixAdjustments();
//Only continue if an invalid opcode is not read by DecodePrefixAdjustments() for cpu bit mode setting.
if( !InvalidOp )
{
//Decode the instruction.
DecodeOpcode();
//-------------------------------------------------------------------------------------------------------------------------
//Intel Larrabee CCCCC condition codes.
//Applies only when Opcode >= 0x700 and the decoded mnemonic ends with ",". The mnemonic is split on ","
//and its first two pieces are rejoined around a suffix chosen below.
//-------------------------------------------------------------------------------------------------------------------------
if( Opcode >= 0x700 && Instruction.slice(-1) === "," )
{
Instruction = Instruction.split(",");
//CMP conditions.
if( Opcode >= 0x720 && Opcode <= 0x72F )
{
//The condition index is VectorRegister with its low two bits shifted out.
IMMValue = VectorRegister >> 2;
//When Float is not set, indexes 3 and 7 insert no condition text.
if( Float || ( IMMValue !== 3 && IMMValue !== 7 ) )
{
Instruction = Instruction[0] + ConditionCodes[IMMValue] + Instruction[1];
}
else { Instruction = Instruction[0] + Instruction[1]; }
//Clear IMMValue and keep only the low two bits of VectorRegister for later steps.
IMMValue = 0; VectorRegister &= 0x03;
}
//Else High/Low: bit 0 of VectorRegister selects "H" (1) or "L" (0).
else
{
Instruction = Instruction[0] + ( ( ( VectorRegister & 1 ) === 1 ) ? "H" : "L" ) + Instruction[1];
}
}
//Setup the X86 Decoder for which operands the instruction uses.
DecodeOperandString();
//Now only some instructions can vector extend, and that is only if the instruction is an SIMD Vector format instruction.
if( !Vect && Extension > 0 && Opcode <= 0x400 ) { InvalidOp = true; }
//The Width Bit setting must match the vector numbers size otherwise this create an invalid operation code in MVEX/EVEX unless the Width bit is ignored.
//Note this assignment overwrites InvalidOp, including a true value set by the check just above.
if( Vect && !IgnoresWidthbit && Extension >= 2 )
{
InvalidOp = ( ( SIMD & 1 ) !== ( WidthBit & 1 ) ); //Note use, and ignore width bit pattern EVEX.
}
if( Opcode >= 0x700 ) { WidthBit ^= IgnoresWidthbit; } //L1OM Width bit invert.
}
//If the instruction is invalid then set the instruction to "???"
if( InvalidOp )
{
out = "???" //set the returned instruction to invalid
}
//Else finish decoding the valid instruction.
else
{
//Decode each operand along the Decoder array in order, and deactivate them.
DecodeOperands();
/*-------------------------------------------------------------------------------------------------------------------------
3DNow Instruction name is encoded by the next byte after the ModR/M, and Reg operands.
-------------------------------------------------------------------------------------------------------------------------*/
if( Opcode === 0x10F )
{
//Lookup operation code using the byte at the current code position, then advance past it.
Instruction = M3DNow[ BinCode[CodePos] ]; NextByte();
//If Invalid instruction (empty, null, or undefined table entry).
if( Instruction === "" || Instruction == null )
{
Instruction = "???"; InsOperands = "";
}
}
/*-------------------------------------------------------------------------------------------------------------------------
Synthetic virtual machine operation codes.
-------------------------------------------------------------------------------------------------------------------------*/
else if( Instruction === "SSS" )
{
//The Next two bytes after the static opcode is the select synthetic virtual machine operation code.
var Code1 = BinCode[CodePos]; NextByte();
var Code2 = BinCode[CodePos]; NextByte();
//No operations exist past 4 in value for both bytes that combine to the operation code.
if( Code1 >= 5 || Code2 >= 5 ) { Instruction = "???"; }
//Else calculate the operation code in the 5x5 map.
else
{
Instruction = MSynthetic[ ( Code1 * 5 ) + Code2 ];
//If Invalid instruction (empty, null, or undefined table entry).
if( Instruction === "" || Instruction == null )
{
Instruction = "???";
}
}
}
//32/16 bit instructions 9A, and EA use Segment, and offset with Immediate format.
//The two comma-separated operands are swapped and joined with ":".
if( Opcode === 0x9A || Opcode === 0xEA )
{
//"t" is declared with var, so it is function-scoped and is reused by the prefix swap further down.
var t = InsOperands.split(",");
InsOperands = t[1] + ":" +t[0];
}
//**Depending on the operation different prefixes replace others for HLE, or MPX, and branch prediction.
//if REP prefix, and LOCK prefix are used together, and the current decoded operation allows HLE XRELEASE.
if(PrefixG1 === Mnemonics[0xF3] && PrefixG2 === Mnemonics[0xF0] && XRelease)
{
PrefixG1 = "XRELEASE"; //Then change REP to XRELEASE.
}
//if REPNE prefix, and LOCK prefix are used together, and the current decoded operation allows HLE XACQUIRE.
if(PrefixG1 === Mnemonics[0xF2] && PrefixG2 === Mnemonics[0xF0] && XAcquire)
{
PrefixG1 = "XACQUIRE"; //Then change REPNE to XACQUIRE.
}
//Depending on the order that the Repeat prefix, and Lock prefix is used flip Prefix G1, and G2 if HLEFlipG1G2 it is true.
if((PrefixG1 === "XRELEASE" || PrefixG1 === "XACQUIRE") && HLEFlipG1G2)
{
t = PrefixG1; PrefixG1 = PrefixG2; PrefixG2 = t;
}
//if HT is active then it is a jump instruction check and adjust for the HT,and HNT prefix.
//The choice is made from SegOverride: Mnemonics[0x2E] gives "HNT", Mnemonics[0x3E] gives "HT".
//This overwrites whatever PrefixG1 held before.
if(HT)
{
if (SegOverride === Mnemonics[0x2E])
{
PrefixG1 = "HNT";
}
else if (SegOverride === Mnemonics[0x3E])
{
PrefixG1 = "HT";
}
}
//Independently of the HT check above: if Prefix is REPNE switch it to BND if operation is a MPX instruction.
if(PrefixG1 === Mnemonics[0xF2] && BND)
{
PrefixG1 = "BND";
}
//Before the Instruction is put together check the length of the instruction if it is longer than 15 bytes the instruction is undefined.
//InstructionHex holds two hex characters per byte, so 30 characters is 15 bytes.
if ( InstructionHex.length > 30 )
{
//Calculate how many bytes over.
var Dif32 = ( ( InstructionHex.length - 30 ) >> 1 );
//Limit the instruction hex output to 15 bytes.
InstructionHex = InstructionHex.substring( 0, 30 );
//Step the 32-bit position back by the number of excess bytes.
Dif32 = Pos32 - Dif32;
//Convert Dif to an unsigned 32-bit number.
//NOTE(review): when this wraps, Pos64 is used unchanged below; if Pos64 is the upper half of the
//address a borrow may be missing here - confirm against how Pos32/Pos64 are maintained.
if( Dif32 < 0 ) { Dif32 += 0x100000000; }
//Convert to hex strings, left-padded with "0" to at least 8 digits (the loops have empty bodies on purpose).
for (var S32 = Dif32.toString(16) ; S32.length < 8; S32 = "0" + S32);
for (var S64 = Pos64.toString(16) ; S64.length < 8; S64 = "0" + S64);
//Go to the Calculated address right after the Instruction UD.
GotoPosition( S64 + S32 );
//Set prefixes, and operands to empty strings, and set Instruction to UD.
PrefixG1 = "";PrefixG2 = ""; Instruction = "???"; InsOperands = "";
}
//Put the Instruction sequence together.
out = PrefixG1 + " " + PrefixG2 + " " + Instruction + " " + InsOperands;
//Strip leading and trailing spaces left by empty prefixes or operands. Runs of spaces inside the string are not collapsed.
out = out.replace(/^[ ]+|[ ]+$/g,'');
//Add error suppression if used: append RoundModes[ RoundMode ] when Opcode >= 0x700 or RoundMode is non-zero.
if( Opcode >= 0x700 || RoundMode !== 0 )
{
out += RoundModes[ RoundMode ];
}
//Return the instruction.
}
return( out );
}
我的第一个附带问题是,他们是怎么知道该这样做的?我没有在英特尔手册中看到任何实现反汇编程序的算法。而且英特尔手册似乎也没有提供任何可以直接用来实现各种表等代码的数据。因此,在我看来,您必须做大量工作,先把英特尔手册提炼成精髓,弄清它的各个方面,然后才能开始考虑实现一个反汇编程序。有没有更简单的方法?我想看看如何实现反汇编程序,但首先想找到一个关于指令和操作数等内容的、干净且计算机可读的数据来源,以便在这个过程中提供帮助。因此我对 xed 及其数据文件感兴趣。但我不认为 xed 适合,它似乎太乱了。
但主要问题是,什么是实现指令反汇编(从机器代码转换为其他任何东西、某种数据结构,甚至只是发出令牌或其他东西)的伪代码过程(或者甚至更好的 JavaScript 过程)?上面的 JavaScript 是否足够好,还是有更直接的实现?知道在伪代码中会是什么样子可能会使事情更容易理解。
我想知道这些主要有两个原因。首先,我想实现一个反汇编器和一个生成器。其次,我看不出如何高效地解析字节(以及字节中的位),而又不需要前后扫描来找出 1-15 字节指令的边界。我还没有想明白如何轻松判断一条指令的开始和结束位置,因此用伪代码演示这个过程会让它一目了然,也更容易思考如何编写真正的解析器。