1 #include "disassembler/Instruction.hxx"
2 #include "disassembler/llvm/LLVMDisassembler.hxx"
3 #include "core/InformationManager.hxx"
4 #include "core/Function.hxx"
5 #include "core/BasicBlock.hxx"
12 using namespace llvm::object
;
13 using std::error_code
;
28 Disassembler
* createLLVMDisassembler(const std::string
& filename
, InformationManager
* manager
) {
32 std::unique_ptr
<Binary
> o
;
33 o
.reset(createBinary(filename
).get());
34 Binary
* op
= o
.release();
36 // ELFType<endian, maxalign, 64bit>
37 if (ELF32LEObjectFile
* object
= dyn_cast
<ELF32LEObjectFile
>(op
)) {
38 return new LLVMDisassembler
<ELFType
<support::little
, 2, false>>(filename
, manager
, object
);
40 if (ELF64LEObjectFile
* object
= dyn_cast
<ELF64LEObjectFile
>(op
)) {
41 return new LLVMDisassembler
<ELFType
<support::little
, 2, true>>(filename
, manager
, object
);
43 if (ELF32BEObjectFile
* object
= dyn_cast
<ELF32BEObjectFile
>(op
)) {
44 return new LLVMDisassembler
<ELFType
<support::big
, 2, false>>(filename
, manager
, object
);
46 if (ELF64BEObjectFile
* object
= dyn_cast
<ELF64BEObjectFile
>(op
)) {
47 return new LLVMDisassembler
<ELFType
<support::big
, 2, true>>(filename
, manager
, object
);
49 if (COFFObjectFile
* object
= dyn_cast
<COFFObjectFile
>(op
)) {
50 return new LLVMDisassembler
<COFFT
>(filename
, manager
, object
);
52 if (MachOObjectFile
* object
= dyn_cast
<MachOObjectFile
>(op
)) {
53 return new LLVMDisassembler
<MACHOT
>(filename
, manager
, object
);
60 * TODO: fallback code in case the file is not an ELF/PE/COFF/Mach-O/... binary
61 * but, e.g., just plain instructions or a boot sector or
// Constructor. Logs the file name, loads the binary with createBinary(),
// sets the architecture of `triple` from the object file and looks up the
// matching LLVM target. For that target it then creates the MC helper
// objects: register info (MRI), assembly info (AsmInfo), subtarget info
// (STI), instruction info (MII), object file info (MOFI), the disassembler
// (DisAsm), instruction analysis (MIA) and the instruction printer (IP).
// Finally an MCObjectDisassembler builds the module stored in Mod.
// The "no ... for target" error logs are presumably guarded by checks that
// are not visible in this excerpt.
64 template <typename ELFT
>
65 LLVMDisassembler
<ELFT
>::LLVMDisassembler(const std::string
& filename
,
66 InformationManager
* manager
,
69 , logger(log4cxx::Logger::getLogger("disassembler.LLVMDisassembler"))
70 , triple("unknown-unknown-unknown")
73 LOG4CXX_DEBUG(logger
, "Handling file " << filename
);
// Load the binary; a failure is reported through the ErrorOr's error code.
76 auto result
= createBinary(filename
);
79 if ((ec
= result
.getError())) {
80 LOG4CXX_ERROR(logger
, "Failed to load Binary" << ec
.message());
85 binary
.reset(result
.get());
87 o
= dyn_cast
<ObjectFile
>(binary
.get());
// NOTE(review): o comes from a dyn_cast and is dereferenced here; confirm
// that it cannot be null at this point.
93 triple
.setArch(Triple::ArchType(o
->getArch()));
94 std::string
tripleName(triple
.getTriple());
96 LOG4CXX_INFO(logger
, "Architecture " << tripleName
);
// Look up the LLVM target for the triple; es receives the error text.
100 target
= TargetRegistry::lookupTarget("", triple
, es
);
102 LOG4CXX_ERROR(logger
, es
);
106 LOG4CXX_INFO(logger
, "Target " << target
->getName());
108 MRI
.reset(target
->createMCRegInfo(tripleName
));
110 LOG4CXX_ERROR(logger
, "no register info for target " << tripleName
);
114 // Set up disassembler.
115 AsmInfo
.reset(target
->createMCAsmInfo(*MRI
, tripleName
));
117 LOG4CXX_ERROR(logger
, "no assembly info for target " << tripleName
);
121 STI
.reset(target
->createMCSubtargetInfo(tripleName
, "", ""));
123 LOG4CXX_ERROR(logger
, "no subtarget info for target " << tripleName
);
127 MII
.reset(target
->createMCInstrInfo());
129 LOG4CXX_ERROR(logger
, "no instruction info for target " << tripleName
);
133 MOFI
.reset(new MCObjectFileInfo
);
// NOTE(review): Ctx is a local variable that is passed to
// createMCDisassembler(); confirm DisAsm does not use it after this
// constructor returns.
134 MCContext
Ctx(AsmInfo
.get(), MRI
.get(), MOFI
.get());
136 DisAsm
.reset(target
->createMCDisassembler(*STI
, Ctx
));
138 LOG4CXX_ERROR(logger
, "no disassembler for target " << tripleName
);
142 target
->createMCRelocationInfo(tripleName
, Ctx
));
145 // MCObjectSymbolizer::createObjectSymbolizer(Ctx, std::move(RelInfo), o));
147 // DisAsm->setSymbolizer(std::move(Symzer));
152 MIA
.reset(target
->createMCInstrAnalysis(MII
.get()));
154 LOG4CXX_ERROR(logger
, "no instruction analysis for target " << tripleName
);
// The instruction printer uses the assembler dialect reported by AsmInfo.
158 int AsmPrinterVariant
= AsmInfo
->getAssemblerDialect();
159 IP
.reset(target
->createMCInstPrinter(AsmPrinterVariant
, *AsmInfo
, *MII
, *MRI
, *STI
));
161 LOG4CXX_ERROR(logger
, "no instruction printer for target " << tripleName
);
// Print immediates in hexadecimal, using the C hex style.
165 IP
->setPrintImmHex(llvm::HexStyle::C
);
166 IP
->setPrintImmHex(true);
// OD is only needed to build the module; the result is kept in Mod.
168 std::unique_ptr
<MCObjectDisassembler
> OD(
169 new MCObjectDisassembler(*o
, *DisAsm
, *MIA
));
170 Mod
.reset(OD
->buildModule(false));
// start(): calls readDynamicSymbols(). Only this call is visible in this
// excerpt; further statements may precede it in the full file.
175 template <typename ELFT
>
176 void LLVMDisassembler
<ELFT
>::start() {
179 readDynamicSymbols();
182 template <typename ELFT
>
183 LLVMDisassembler
<ELFT
>::~LLVMDisassembler() {}
185 template <typename ELFT
>
186 Function
* LLVMDisassembler
<ELFT
>::disassembleFunctionAt(uint64_t address
, const std::string
& name
) {
188 SectionRef text_section
= getTextSection();
189 uint64_t base_address
, size
;
190 text_section
.getAddress(base_address
);
191 text_section
.getSize(size
);
193 if (address
< base_address
||
194 address
>= base_address
+ size
) {
198 if (NULL
== (function
= manager
->getFunction(address
))) {
202 s
<< "<Unnamed 0x" << std::hex
<< address
<< ">";
203 function
= manager
->newFunction(address
);
204 function
->setName(s
.str());
206 function
= manager
->newFunction(address
);
207 function
->setName(name
);
209 disassembleFunction(function
);
// Disassembles one function by following its control flow. Starts with a
// basic block at the function's start address and decodes instructions until
// one ends the block. Targets of direct non-call branches become new blocks
// that are queued on remaining_blocks; targets of direct calls that the
// manager does not know yet are collected in called_functions.
// Afterwards splitBlocks() and manager->finishFunction() are called, and
// every collected call target is disassembled via disassembleFunctionAt().
215 template <typename ELFT
>
216 void LLVMDisassembler
<ELFT
>::disassembleFunction(Function
* function
) {
217 std::vector
<uint64_t> called_functions
;
218 std::stack
<BasicBlock
*> remaining_blocks
;
220 * Do all blocks get added properly? We should take care to remove
221 * the other ones at the end of the function!
// Blocks created during this call, keyed by their start address.
223 std::map
<uint64_t, BasicBlock
*> new_blocks
;
224 SectionRef text_section
= getTextSection();
226 text_section
.getContents(bytes
);
227 StringRefMemoryObject
ref(bytes
);
229 LOG4CXX_DEBUG(logger
, "Handling function " << function
->getName());
// The first block starts at the function's start address.
231 BasicBlock
* block
= manager
->newBasicBlock(function
->getStartAddress());
232 remaining_blocks
.push(block
);
233 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
234 function
->addBasicBlock(block
);
236 uint64_t base_address
, size
;
237 text_section
.getAddress(base_address
);
238 text_section
.getSize(size
);
239 LOG4CXX_DEBUG(logger
, "Text section at " << std::hex
<< base_address
<< " with size " << size
);
241 while (remaining_blocks
.size()) {
242 BasicBlock
* current_block
= remaining_blocks
.top();
243 remaining_blocks
.pop();
245 LOG4CXX_DEBUG(logger
, "Handling Block starting at " << std::hex
246 << current_block
->getStartAddress());
// current_address is an offset relative to the text section's address;
// it is the position passed to getInstruction() together with ref.
249 uint64_t current_address
= current_block
->getStartAddress() - base_address
;
253 llvm::raw_string_ostream
s(buf
);
255 if(llvm::MCDisassembler::Success
==
256 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
// evaluateBranch() yields a section-relative target here, so base_address
// is added to obtain the absolute address.
259 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
260 jmptarget
+= base_address
;
261 if (!MIA
->isIndirectBranch(inst
)) {
262 if (MIA
->isCall(inst
)) {
// Only call targets the manager does not know yet are queued.
263 if (NULL
== manager
->getFunction(jmptarget
))
264 called_functions
.push_back(jmptarget
);
// Successor slot 0 is the branch target.
// NOTE(review): jmptarget is not checked against the text section
// bounds before a block is created for it - confirm this is intended.
266 current_block
->setNextBlock(0, jmptarget
);
267 if (new_blocks
.find(jmptarget
) == new_blocks
.end()) {
268 BasicBlock
* block
= manager
->newBasicBlock(jmptarget
);
270 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
271 function
->addBasicBlock(block
);
272 remaining_blocks
.push(block
);
// NOTE(review): this message prints the start of current_block, while the
// block being reused is the one at jmptarget - confirm which was intended.
274 LOG4CXX_DEBUG(logger
, "Reusing Block starting at " << std::hex
275 << current_block
->getStartAddress());
276 function
->addBasicBlock(new_blocks
.find(jmptarget
)->second
);
// For a conditional branch, successor slot 1 is the address directly
// after this instruction.
278 if (MIA
->isConditionalBranch(inst
)) {
279 jmptarget
= base_address
+ current_address
+ inst_size
;
280 current_block
->setNextBlock(1, jmptarget
);
281 if (new_blocks
.find(jmptarget
) == new_blocks
.end()) {
282 BasicBlock
* block
= manager
->newBasicBlock(jmptarget
);
284 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
285 function
->addBasicBlock(block
);
286 remaining_blocks
.push(block
);
// NOTE(review): same as above - the logged address is that of current_block.
288 LOG4CXX_DEBUG(logger
, "Reusing Block starting at " << std::hex
289 << current_block
->getStartAddress());
290 function
->addBasicBlock(new_blocks
.find(jmptarget
)->second
);
// A block ends at a zero-sized instruction, a terminator or a branch; its
// end address is the address directly after that instruction.
301 if (inst_size
== 0 || MIA
->isTerminator(inst
) || MIA
->isBranch(inst
)) {
302 current_block
->setEndAddress(current_address
+ base_address
+ inst_size
);
303 LOG4CXX_DEBUG(logger
, "Finished Block at " << std::hex
<<
304 current_block
->getEndAddress());
307 current_address
+= inst_size
;
// All blocks are decoded: split overlapping blocks, report the function to
// the manager, then disassemble the call targets collected above.
310 splitBlocks(function
);
311 LOG4CXX_DEBUG(logger
, "Finished function " << function
->getName());
312 manager
->finishFunction(function
);
313 for (uint64_t address
: called_functions
)
314 disassembleFunctionAt(address
);
317 template <typename ELFT
>
318 void LLVMDisassembler
<ELFT
>::disassemble() {
319 SectionRef text_section
= getTextSection();
320 std::vector
<Function
*> remaining_functions
;
322 // Assume all function symbols actually start a real function
323 for (auto x
= symbols
.begin(); x
!= symbols
.end(); ++x
) {
326 SymbolRef::Type symbol_type
;
329 if (text_section
.containsSymbol(x
->second
, contains
) || !contains
)
332 if (x
->second
.getType(symbol_type
)
333 || SymbolRef::ST_Function
!= symbol_type
)
336 if (!x
->second
.getAddress(result
)) {
337 Function
* fun
= manager
->newFunction(result
);
339 fun
->setName(x
->first
);
340 remaining_functions
.push_back(fun
);
341 LOG4CXX_DEBUG(logger
, "Disasembling " << x
->first
);
343 LOG4CXX_DEBUG(logger
, "Function at " << std::hex
<< result
344 << " already disassembled as " << manager
->getFunction(result
)->getName());
349 for (Function
* function
: remaining_functions
) {
350 disassembleFunction(function
);
351 manager
->finishFunction(function
);
354 if (binary
->isELF()) {
355 uint64_t _entryAddress
= entryAddress();
356 LOG4CXX_DEBUG(logger
, "Adding entryAddress at: " << std::hex
<< _entryAddress
);
358 s
<< "<_start 0x" << std::hex
<< _entryAddress
<< ">";
360 disassembleFunctionAt(_entryAddress
, s
.str());
363 if (!manager
->hasFunctions()) {
365 text_section
.getAddress(text_entry
);
366 LOG4CXX_INFO(logger
, "No Symbols found, starting at the beginning of the text segment");
367 disassembleFunctionAt(text_entry
);
372 uint64_t LLVMDisassembler
<COFFT
>::entryAddress() {
373 const auto coffobject
= dyn_cast
<COFFObjectFile
>(o
);
374 const struct pe32_header
* pe32_header
;
375 const struct pe32plus_header
* pe32plus_header
;
377 coffobject
->getPE32PlusHeader(pe32plus_header
);
379 if (pe32plus_header
) {
380 return pe32plus_header
->AddressOfEntryPoint
;
382 coffobject
->getPE32Header(pe32_header
);
383 return pe32_header
->AddressOfEntryPoint
;
// Mach-O variant of entryAddress(); its body is not visible in this excerpt.
388 uint64_t LLVMDisassembler
<MACHOT
>::entryAddress() {
393 template <typename ELFT
>
394 uint64_t LLVMDisassembler
<ELFT
>::entryAddress() {
395 const auto elffile
= dyn_cast
<ELFObjectFile
<ELFT
>>(o
)->getELFFile();
396 const auto * header
= elffile
->getHeader();
398 return header
->e_entry
;
// Walks every block of the function instruction by instruction. When the
// manager knows another basic block at the looked-up address and the current
// block does not already end there, the current block is shortened to end at
// that point and the other block becomes its successor in slot 0, with slot 1
// set to 0.
// The full argument of the getBasicBlock() lookup is not visible in this
// excerpt.
401 template <typename ELFT
>
402 void LLVMDisassembler
<ELFT
>::splitBlocks(Function
* function
) {
403 SectionRef text_section
= getTextSection();
405 text_section
.getContents(bytes
);
406 StringRefMemoryObject
ref(bytes
);
408 LOG4CXX_DEBUG(logger
, "Splitting Blocks in Function " << function
->getName());
409 // Split blocks where jumps are going inside the block
// NOTE(review): function->addBasicBlock() is called further down while this
// loop iterates over function->blocks(); confirm that this cannot invalidate
// the iterator.
410 for (auto it
= function
->blocks().begin();
411 it
!= function
->blocks().end();
413 BasicBlock
* current_block
= it
->second
;
// An end address of 0 is treated as "block was never finished".
414 if (current_block
->getEndAddress() == 0) {
415 LOG4CXX_ERROR(logger
, "UNFINISHED BLOCK " << std::hex
<< current_block
->getStartAddress());
419 uint64_t base_address
;
420 text_section
.getAddress(base_address
);
// Offsets are relative to the text section's address.
421 uint64_t current_address
= current_block
->getStartAddress() - base_address
;
422 while(current_block
->getEndAddress() - base_address
> current_address
) {
425 llvm::raw_string_ostream
s(buf
);
427 if(llvm::MCDisassembler::Success
==
428 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
429 // See if some other block starts here
430 BasicBlock
* other
= manager
->getBasicBlock(current_address
434 // Special case, other block starts here but we are at the end anyway
// endaddress is the absolute address directly after this instruction.
436 uint64_t endaddress
= current_address
+ inst_size
+ base_address
;
437 if (endaddress
!= current_block
->getEndAddress()) {
438 LOG4CXX_DEBUG(logger
, "Shortening block starting at "
440 << current_block
->getStartAddress()
442 << other
->getStartAddress());
443 function
->addBasicBlock(other
);
444 current_block
->setEndAddress(endaddress
);
445 current_block
->setNextBlock(0, other
->getStartAddress());
446 current_block
->setNextBlock(1, 0);
452 current_address
+= inst_size
;
// COFF variant of readDynamicSymbols(); its body is not visible in this excerpt.
458 void LLVMDisassembler
<COFFT
>::readDynamicSymbols() {
// Mach-O variant of readDynamicSymbols(); its body is not visible in this excerpt.
463 void LLVMDisassembler
<MACHOT
>::readDynamicSymbols() {
467 template <typename ELFT
>
468 void LLVMDisassembler
<ELFT
>::readDynamicSymbols() {
469 const auto elffile
= dyn_cast
<ELFObjectFile
<ELFT
>>(o
)->getELFFile();
470 for (auto it
= elffile
->begin_dynamic_symbols(),
471 end
= elffile
->end_dynamic_symbols();
474 if (it
->getType() == 2) { // Function
476 // TODO: Error handling
477 std::string symbolname
= *(elffile
->getSymbolName(it
));
478 std::string symbolversion
= *(elffile
->getSymbolVersion(nullptr, &*it
, is_default
));
479 // TODO: actually get the symbol address from relocations
480 Function
* f
= manager
->newDynamicFunction(0);
481 f
->setName(symbolname
+ (is_default
? "@@" : "@") + symbolversion
);
482 manager
->finishFunction(f
);
484 LOG4CXX_DEBUG(logger
, "Adding dynamic Symbol " << symbolname
<< (is_default
? "@@" : "@") << symbolversion
);
// Iterates over the object file's symbols and stores each one in `symbols`
// under its name. A failing getName() is logged with the error's message.
// NOTE(review): what happens after that log (skip the symbol or stop) is not
// visible in this excerpt.
489 template <typename ELFT
>
490 void LLVMDisassembler
<ELFT
>::readSymbols() {
492 symbol_iterator
si(o
->symbol_begin()), se(o
->symbol_end());
493 for (; si
!= se
; ++si
) {
495 if ((ec
= si
->getName(name
))) {
496 LOG4CXX_ERROR(logger
, ec
.message());
499 LOG4CXX_DEBUG(logger
, "Added symbol " << name
.str());
500 symbols
.insert(make_pair(name
.str(), *si
));
// Iterates over the object file's sections and stores each one in `sections`
// under its name. A failing getName() is logged with the error's message.
// NOTE(review): what happens after that log (skip the section or stop) is not
// visible in this excerpt.
504 template <typename ELFT
>
505 void LLVMDisassembler
<ELFT
>::readSections() {
507 section_iterator
i(o
->section_begin()), e(o
->section_end());
508 for (; i
!= e
; ++i
) {
510 if ((ec
= i
->getName(name
))) {
511 LOG4CXX_ERROR(logger
, ec
.message());
514 LOG4CXX_DEBUG(logger
, "Added section " << name
.str());
515 sections
.insert(make_pair(name
.str(), *i
));
520 // template <typename ELFT>
521 // void LLVMDisassembler<ELFT>::forEachFunction(std::function<void (uint64_t, Function*)> callback) {
522 // // std::for_each(functions.begin(), functions.end(),
523 // // [&](std::pair<uint64_t, Function*> x) {
524 // // callback(x.first, x.second);
// Decodes the instructions of one basic block and returns them as a list.
// Each decoded instruction is stored with its absolute address, its printed
// text, its raw bytes and a reference string. Bytes that cannot be decoded
// are stored as an "Invalid Instruction" entry of one byte.
528 template <typename ELFT
>
529 std::list
<Instruction
> LLVMDisassembler
<ELFT
>::getInstructions(const BasicBlock
*block
) {
530 std::list
<Instruction
> result
;
531 SectionRef text_section
= getTextSection();
532 uint64_t base_address
;
533 text_section
.getAddress(base_address
);
// Start and end are converted to offsets relative to the text section's
// address.
534 uint64_t current_address
= block
->getStartAddress() - base_address
;
535 uint64_t end_position
= block
->getEndAddress() - base_address
;
538 text_section
.getContents(bytes
);
539 StringRefMemoryObject
ref(bytes
);
541 while (current_address
< end_position
) {
545 llvm::raw_string_ostream
s(buf
);
547 if(llvm::MCDisassembler::Success
==
548 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
// NOTE(review): the array size depends on the runtime value inst_size, which
// makes this a variable-length array (a compiler extension in C++). It is
// also a second variable named `bytes`, distinct from the one passed to
// getContents() above.
550 uint8_t bytes
[inst_size
+2];
551 ref
.readBytes(current_address
, inst_size
, bytes
);
555 IP
->printInst(&inst
, s
, "");
// For branches with a known target, a reference string is built from a
// prefix ("function:" for calls) and the absolute target in hexadecimal.
556 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
557 std::stringstream stream
;
558 if (MIA
->isCall(inst
))
559 stream
<< "function:";
563 stream
<< std::hex
<< (base_address
+ jmptarget
);
566 result
.push_back(Instruction(current_address
+ base_address
, s
.str(),
567 std::vector
<uint8_t>(bytes
, bytes
+inst_size
), ref
));
569 LOG4CXX_WARN(logger
, "Invalid byte at" << std::hex
<< current_address
+ base_address
);
571 ref
.readBytes(current_address
, 1, bytes
);
572 result
.push_back(Instruction(current_address
+ base_address
, "Invalid Instruction",
573 std::vector
<uint8_t>(bytes
, bytes
+1), ""));
577 current_address
+= inst_size
;
// Decodes the instructions in the address range [start, end) and calls `fun`
// once per instruction. For a decoded instruction `fun` receives the raw
// bytes, their count, the printed text and a reference string. For bytes
// that cannot be decoded it receives NULL, 0, "Invalid Byte" and "".
582 template <typename ELFT
>
583 void LLVMDisassembler
<ELFT
>::printEachInstruction(uint64_t start
, uint64_t end
,
584 std::function
<void (uint8_t*, size_t,
586 const std::string
&)> fun
) {
587 SectionRef text_section
= getTextSection();
588 uint64_t base_address
;
589 text_section
.getAddress(base_address
);
// Offsets are relative to the text section's address.
590 uint64_t current_address
= start
- base_address
;
593 text_section
.getContents(bytes
);
594 StringRefMemoryObject
ref(bytes
);
596 while (current_address
< end
- base_address
) {
600 llvm::raw_string_ostream
s(buf
);
602 if(llvm::MCDisassembler::Success
==
603 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
// NOTE(review): the array size depends on the runtime value inst_size, which
// makes this a variable-length array (a compiler extension in C++). It is
// also a second variable named `bytes`, distinct from the one passed to
// getContents() above.
605 uint8_t bytes
[inst_size
+2];
606 ref
.readBytes(current_address
, inst_size
, bytes
);
610 IP
->printInst(&inst
, s
, "");
// For branches with a known target, a reference string is built from a
// prefix ("function:" for calls) and the absolute target in hexadecimal.
611 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
612 std::stringstream stream
;
613 if (MIA
->isCall(inst
))
614 stream
<< "function:";
618 stream
<< std::hex
<< (base_address
+ jmptarget
);
623 fun(bytes
, inst_size
, s
.str(), ref
);
625 LOG4CXX_WARN(logger
, "Invalid byte at" << std::hex
<< current_address
+ base_address
);
626 fun(NULL
, 0, "Invalid Byte", "");
630 current_address
+= inst_size
;
634 template <typename ELFT
>
635 SectionRef LLVMDisassembler
<ELFT
>::getTextSection() {
636 return sections
[".text"];
640 SectionRef LLVMDisassembler
<MACHOT
>::getTextSection() {
641 return sections
["__text"];