1 #include "disassembler/Instruction.hxx"
2 #include "disassembler/llvm/LLVMDisassembler.hxx"
3 #include "core/InformationManager.hxx"
4 #include "core/Function.hxx"
5 #include "core/BasicBlock.hxx"
6 #include "core/Exception.hxx"
7 #include <boost/algorithm/string.hpp>
14 using namespace llvm::object
;
15 using std::error_code
;
// Factory for the LLVM-backed disassembler.
//
// Opens `filename` with createBinary() and, depending on the concrete object
// file class the result can be dyn_cast to, returns a heap-allocated
// LLVMDisassembler specialised for 32/64-bit little/big-endian ELF, COFF or
// Mach-O. `manager` is forwarded unchanged to the created disassembler.
//
// NOTE(review): this listing omits several original lines of this function
// (early returns, the `#if`/`#endif` around the LLVM-version branches, the
// `try {` and the closing braces) - consult the full file for exact control flow.
// NOTE(review): the object is created with `new`; who deletes it is not
// visible from here.
30 Disassembler
* createLLVMDisassembler(const std::string
& filename
, InformationManager
* manager
) {
31 log4cxx::LoggerPtr
logger(log4cxx::Logger::getLogger("disassembler.LLVMDisassembler"));
// Load the file; a load error is reported through the logger.
35 auto retval
= createBinary(filename
);
36 if (error_code ec
= retval
.getError()) {
37 LOG4CXX_ERROR(logger
, ec
.message());
// First version branch: the result hands out the raw Binary pointer directly.
41 Binary
* op
= retval
.get();
42 #elif defined(LLVM_36)
// LLVM 3.6: the result holds an OwningBinary; it is moved out, a raw pointer
// is taken from it, and takeBinary() is then called on it.
43 OwningBinary
<Binary
> ob
;
44 ob
= std::move(retval
.get());
45 Binary
* op
= ob
.getBinary();
46 auto foo
= ob
.takeBinary();
// Dispatch on the concrete object-file type; the first matching dyn_cast wins.
52 // ELFType<endian, maxalign, 64bit>
53 if (ELF32LEObjectFile
* object
= dyn_cast
<ELF32LEObjectFile
>(op
)) {
54 return new LLVMDisassembler
<ELFType
<support::little
, 2, false>>(filename
, manager
, object
);
56 if (ELF64LEObjectFile
* object
= dyn_cast
<ELF64LEObjectFile
>(op
)) {
57 return new LLVMDisassembler
<ELFType
<support::little
, 2, true>>(filename
, manager
, object
);
59 if (ELF32BEObjectFile
* object
= dyn_cast
<ELF32BEObjectFile
>(op
)) {
60 return new LLVMDisassembler
<ELFType
<support::big
, 2, false>>(filename
, manager
, object
);
62 if (ELF64BEObjectFile
* object
= dyn_cast
<ELF64BEObjectFile
>(op
)) {
63 return new LLVMDisassembler
<ELFType
<support::big
, 2, true>>(filename
, manager
, object
);
65 if (COFFObjectFile
* object
= dyn_cast
<COFFObjectFile
>(op
)) {
66 return new LLVMDisassembler
<COFFT
>(filename
, manager
, object
);
68 if (MachOObjectFile
* object
= dyn_cast
<MachOObjectFile
>(op
)) {
69 return new LLVMDisassembler
<MACHOT
>(filename
, manager
, object
);
// BinaryNotSupported is caught here - presumably raised by the
// LLVMDisassembler constructor; the handler body is not visible in this listing.
71 } catch (BinaryNotSupported
& e
) {
78 * TODO: fallback code for when the file is not an ELF/PE/COFF/Mach-O/.. binary
79 * but, e.g., just plain instructions or a boot sector or
// Constructor: loads `filename`, derives the target triple from the object
// file's architecture and creates the LLVM MC components used for
// disassembly, in this order: register info (MRI), assembly info (AsmInfo),
// subtarget info (STI), instruction info (MII), object file info (MOFI),
// disassembler (DisAsm), instruction analysis (MIA), instruction printer (IP).
//
// After each creation step an error is logged and a BinaryNotSupported object
// is declared; the guarding condition and the statement that uses the object
// are not visible here (presumably a null check followed by `throw e;`).
//
// NOTE(review): the third parameter, the start of the initializer list and
// several other original lines are missing from this listing.
82 template <typename ELFT
>
83 LLVMDisassembler
<ELFT
>::LLVMDisassembler(const std::string
& filename
,
84 InformationManager
* manager
,
87 , logger(log4cxx::Logger::getLogger("disassembler.LLVMDisassembler"))
88 , triple("unknown-unknown-unknown")
91 LOG4CXX_DEBUG(logger
, "Handling file " << filename
);
// Load the binary again for this instance; a failure is logged with its message.
94 auto result
= createBinary(filename
);
97 if ((ec
= result
.getError())) {
98 LOG4CXX_ERROR(logger
, "Failed to load Binary" << ec
.message());
// The member `binary` takes over the loaded Binary.
104 binary
.reset(result
.get());
105 #elif defined(LLVM_36)
106 OwningBinary
<Binary
> ob
;
107 ob
= std::move(result
.get());
108 Binary
* op
= ob
.getBinary();
// View the binary as an ObjectFile; dyn_cast yields null if it is not one.
113 o
= dyn_cast
<ObjectFile
>(binary
.get());
// Only the architecture component of the triple is taken from the object file.
119 triple
.setArch(Triple::ArchType(o
->getArch()));
120 std::string
tripleName(triple
.getTriple());
122 LOG4CXX_INFO(logger
, "Architecture " << tripleName
);
// Look up the LLVM target for the triple; `es` receives the error text.
126 target
= TargetRegistry::lookupTarget("", triple
, es
);
128 LOG4CXX_ERROR(logger
, es
);
129 BinaryNotSupported e
;
133 LOG4CXX_INFO(logger
, "Target " << target
->getName());
// Register info.
135 MRI
.reset(target
->createMCRegInfo(tripleName
));
137 LOG4CXX_ERROR(logger
, "no register info for target " << tripleName
);
138 BinaryNotSupported e
;
142 // Set up disassembler.
143 AsmInfo
.reset(target
->createMCAsmInfo(*MRI
, tripleName
));
145 LOG4CXX_ERROR(logger
, "no assembly info for target " << tripleName
);
146 BinaryNotSupported e
;
// Subtarget info with empty CPU and feature strings.
150 STI
.reset(target
->createMCSubtargetInfo(tripleName
, "", ""));
152 LOG4CXX_ERROR(logger
, "no subtarget info for target " << tripleName
);
153 BinaryNotSupported e
;
// Instruction info.
157 MII
.reset(target
->createMCInstrInfo());
159 LOG4CXX_ERROR(logger
, "no instruction info for target " << tripleName
);
160 BinaryNotSupported e
;
// NOTE(review): Ctx is a local of the constructor, yet it is passed by
// reference into createMCDisassembler/createMCRelocationInfo - confirm that
// neither result keeps a reference to it after the constructor returns.
164 MOFI
.reset(new MCObjectFileInfo
);
165 MCContext
Ctx(AsmInfo
.get(), MRI
.get(), MOFI
.get());
167 DisAsm
.reset(target
->createMCDisassembler(*STI
, Ctx
));
169 LOG4CXX_ERROR(logger
, "no disassembler for target " << tripleName
);
170 BinaryNotSupported e
;
// Relocation info; the receiver of this call is not visible in this listing.
174 target
->createMCRelocationInfo(tripleName
, Ctx
));
177 // MCObjectSymbolizer::createObjectSymbolizer(Ctx, std::move(RelInfo), o));
179 // DisAsm->setSymbolizer(std::move(Symzer));
// Instruction analysis (used by the other methods for branch/call classification).
184 MIA
.reset(target
->createMCInstrAnalysis(MII
.get()));
186 LOG4CXX_ERROR(logger
, "no instruction analysis for target " << tripleName
);
187 BinaryNotSupported e
;
// Instruction printer using the assembler dialect reported by AsmInfo.
191 int AsmPrinterVariant
= AsmInfo
->getAssemblerDialect();
192 IP
.reset(target
->createMCInstPrinter(AsmPrinterVariant
, *AsmInfo
, *MII
, *MRI
, *STI
));
194 LOG4CXX_ERROR(logger
, "no instruction printer for target " << tripleName
);
195 BinaryNotSupported e
;
// Configure hexadecimal printing of immediates: C hex style, printing enabled.
199 IP
->setPrintImmHex(llvm::HexStyle::C
);
200 IP
->setPrintImmHex(true);
202 // std::unique_ptr<MCObjectDisassembler> OD(
203 // new MCObjectDisassembler(*o, *DisAsm, *MIA));
204 //Mod.reset(OD->buildModule(false));
// start(): the only statement of the body visible in this listing reads the
// dynamic symbols via readDynamicSymbols().
// NOTE(review): original lines 211-212 and the end of the body are missing
// here; other steps may run before or after this call.
209 template <typename ELFT
>
210 void LLVMDisassembler
<ELFT
>::start() {
213 readDynamicSymbols();
216 template <typename ELFT
>
217 LLVMDisassembler
<ELFT
>::~LLVMDisassembler() {}
// Disassembles the function starting at `address`.
//
// `address` is first checked against the range [base_address, base_address + size)
// of the text section. If the InformationManager has no function at that
// address, a new one is created, named either "<Unnamed 0x...>" (hex address)
// or `name`, and handed to disassembleFunction().
//
// NOTE(review): the statement executed when the address lies outside the text
// section, the condition choosing between the two names, and the return
// statement are not visible in this listing.
219 template <typename ELFT
>
220 Function
* LLVMDisassembler
<ELFT
>::disassembleFunctionAt(uint64_t address
, const std::string
& name
) {
222 SectionRef text_section
= getTextSection();
223 uint64_t base_address
, size
;
// Section address and size: out-parameter API in the first version branch,
// return values in the LLVM 3.6 branch.
225 text_section
.getAddress(base_address
);
226 text_section
.getSize(size
);
227 #elif defined(LLVM_36)
228 base_address
= text_section
.getAddress();
229 size
= text_section
.getSize();
// Range check: is the address outside the text section?
231 if (address
< base_address
||
232 address
>= base_address
+ size
) {
// Only create and disassemble a function if none is known at this address.
236 if (NULL
== (function
= manager
->getFunction(address
))) {
// Synthesised name containing the address in hex.
240 s
<< "<Unnamed 0x" << std::hex
<< address
<< ">";
241 function
= manager
->newFunction(address
);
242 function
->setName(s
.str());
// Caller-supplied name.
244 function
= manager
->newFunction(address
);
245 function
->setName(name
);
247 disassembleFunction(function
);
// Discovers the basic blocks of `function` by following its control flow.
//
// Starting with a block at the function's start address, blocks are taken
// from a stack (`remaining_blocks`) and decoded instruction by instruction
// from the text section. For every branch whose target MIA can evaluate:
//   * direct calls to addresses without a known function are remembered in
//     `called_functions`;
//   * otherwise the target becomes successor 0 of the current block, and for
//     conditional branches the fall-through address becomes successor 1;
//     blocks not yet in `new_blocks` are created and pushed on the stack.
// A block ends at an instruction of size 0, a terminator or a branch.
// Afterwards splitBlocks() is run, the function is reported as finished to
// the manager, and every remembered call target is disassembled through
// disassembleFunctionAt().
//
// NOTE(review): several declarations, `#if` lines and closing braces of the
// original are missing from this listing.
253 template <typename ELFT
>
254 void LLVMDisassembler
<ELFT
>::disassembleFunction(Function
* function
) {
255 std::vector
<uint64_t> called_functions
;
256 std::stack
<BasicBlock
*> remaining_blocks
;
258 * Do all blocks get added properly? We should take care to remove
259 * the other ones at the end of the function!
// Blocks created during this call, keyed by start address.
261 std::map
<uint64_t, BasicBlock
*> new_blocks
;
262 SectionRef text_section
= getTextSection();
// Raw bytes of the text section, wrapped in the byte-source type of the
// respective LLVM version.
264 text_section
.getContents(bytes
);
266 StringRefMemoryObject
ref(bytes
);
267 #elif defined(LLVM_36)
268 ArrayRef
<uint8_t> bytearray(reinterpret_cast<const uint8_t *>(bytes
.data()),
271 #error LLVM != 3.5 | 3.6 not supported
274 LOG4CXX_DEBUG(logger
, "Handling function " << function
->getName());
// Seed the work list with the function's entry block.
276 BasicBlock
* block
= manager
->newBasicBlock(function
->getStartAddress());
277 remaining_blocks
.push(block
);
278 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
279 function
->addBasicBlock(block
);
281 uint64_t base_address
, size
;
283 text_section
.getAddress(base_address
);
284 text_section
.getSize(size
);
285 #elif defined(LLVM_36)
286 base_address
= text_section
.getAddress();
287 size
= text_section
.getSize();
289 LOG4CXX_DEBUG(logger
, "Text section at " << std::hex
<< base_address
<< " with size " << size
);
// Process blocks until the work list is empty.
291 while (remaining_blocks
.size()) {
292 BasicBlock
* current_block
= remaining_blocks
.top();
293 remaining_blocks
.pop();
295 LOG4CXX_DEBUG(logger
, "Handling Block starting at " << std::hex
296 << current_block
->getStartAddress());
// current_address is an offset into the text section, not an absolute address.
299 uint64_t current_address
= current_block
->getStartAddress() - base_address
;
303 llvm::raw_string_ostream
s(buf
);
// Decode one instruction at the current offset (API differs per LLVM version).
305 if(llvm::MCDisassembler::Success
==
307 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
308 #elif defined(LLVM_36)
309 DisAsm
->getInstruction(inst
, inst_size
,
310 bytearray
.slice(current_address
),
311 base_address
+ current_address
,
// evaluateBranch is given the section-relative offset, so the resulting
// target is converted to an absolute address right afterwards.
316 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
317 jmptarget
+= base_address
;
318 if (!MIA
->isIndirectBranch(inst
)) {
// Direct call: remember the callee unless a function is already known there.
319 if (MIA
->isCall(inst
)) {
320 if (NULL
== manager
->getFunction(jmptarget
))
321 called_functions
.push_back(jmptarget
);
// Direct jump: the target is successor 0 of the current block.
323 current_block
->setNextBlock(0, jmptarget
);
324 if (new_blocks
.find(jmptarget
) == new_blocks
.end()) {
325 BasicBlock
* block
= manager
->newBasicBlock(jmptarget
);
327 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
328 function
->addBasicBlock(block
);
329 remaining_blocks
.push(block
);
// NOTE(review): the message prints the start of current_block, not of the
// block being reused (the one at jmptarget).
331 LOG4CXX_DEBUG(logger
, "Reusing Block starting at " << std::hex
332 << current_block
->getStartAddress());
333 function
->addBasicBlock(new_blocks
.find(jmptarget
)->second
);
// Conditional branch: the fall-through address is successor 1.
335 if (MIA
->isConditionalBranch(inst
)) {
336 jmptarget
= base_address
+ current_address
+ inst_size
;
337 current_block
->setNextBlock(1, jmptarget
);
338 if (new_blocks
.find(jmptarget
) == new_blocks
.end()) {
339 BasicBlock
* block
= manager
->newBasicBlock(jmptarget
);
341 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
342 function
->addBasicBlock(block
);
343 remaining_blocks
.push(block
);
345 LOG4CXX_DEBUG(logger
, "Reusing Block starting at " << std::hex
346 << current_block
->getStartAddress());
347 function
->addBasicBlock(new_blocks
.find(jmptarget
)->second
);
// End of block: zero-sized decode, terminator or branch. The end address is
// the absolute address just past the current instruction.
358 if (inst_size
== 0 || MIA
->isTerminator(inst
) || MIA
->isBranch(inst
)) {
359 current_block
->setEndAddress(current_address
+ base_address
+ inst_size
);
360 LOG4CXX_DEBUG(logger
, "Finished Block at " << std::hex
<<
361 current_block
->getEndAddress());
364 current_address
+= inst_size
;
// Post-processing: split overlapping blocks, publish the function, then
// disassemble the call targets collected above.
367 splitBlocks(function
);
368 LOG4CXX_DEBUG(logger
, "Finished function " << function
->getName());
369 manager
->finishFunction(function
);
370 for (uint64_t address
: called_functions
)
371 disassembleFunctionAt(address
);
// Top-level disassembly driver.
//
// 1. Walks `symbols`; for symbols that pass the text-section test, are of
//    type ST_Function and whose address can be read, a Function is created
//    through the manager, named after the symbol and queued.
// 2. Disassembles every queued function and reports it as finished.
// 3. For ELF binaries, additionally disassembles the entry address under the
//    name "<_start 0x...>".
// 4. If the manager still has no functions, starts at the beginning of the
//    text section.
//
// NOTE(review): the statements following the filter conditions (presumably
// `continue;`) and several declarations are not visible in this listing.
374 template <typename ELFT
>
375 void LLVMDisassembler
<ELFT
>::disassemble() {
376 SectionRef text_section
= getTextSection();
377 std::vector
<Function
*> remaining_functions
;
379 // Assume all function symbols actually start a real function
380 for (auto x
= symbols
.begin(); x
!= symbols
.end(); ++x
) {
383 SymbolRef::Type symbol_type
;
// Filter: symbol must belong to the text section (API differs per LLVM version).
386 if (text_section
.containsSymbol(x
->second
, contains
) || !contains
)
387 #elif defined(LLVM_36)
388 if (!text_section
.containsSymbol(x
->second
))
// Filter: the symbol type must be readable and be ST_Function.
392 if (x
->second
.getType(symbol_type
)
393 || SymbolRef::ST_Function
!= symbol_type
)
// getAddress() evaluates to false on success.
396 if (!x
->second
.getAddress(result
)) {
397 Function
* fun
= manager
->newFunction(result
);
399 fun
->setName(x
->first
);
400 remaining_functions
.push_back(fun
);
401 LOG4CXX_DEBUG(logger
, "Disasembling " << x
->first
);
// Logged when a function already exists at this address (the branch
// condition itself is not visible in this listing).
403 LOG4CXX_DEBUG(logger
, "Function at " << std::hex
<< result
404 << " already disassembled as " << manager
->getFunction(result
)->getName());
// Disassemble everything queued above.
409 for (Function
* function
: remaining_functions
) {
410 disassembleFunction(function
);
411 manager
->finishFunction(function
);
// ELF only: make sure the entry point is disassembled as well.
414 if (binary
->isELF()) {
415 uint64_t _entryAddress
= entryAddress();
416 LOG4CXX_DEBUG(logger
, "Adding entryAddress at: " << std::hex
<< _entryAddress
);
418 s
<< "<_start 0x" << std::hex
<< _entryAddress
<< ">";
420 disassembleFunctionAt(_entryAddress
, s
.str());
// Fallback when nothing was found: start at the first address of the text section.
423 if (!manager
->hasFunctions()) {
426 text_section
.getAddress(text_entry
);
427 #elif defined(LLVM_36)
428 text_entry
= text_section
.getAddress();
430 LOG4CXX_INFO(logger
, "No Symbols found, starting at the beginning of the text segment");
431 disassembleFunctionAt(text_entry
);
436 uint64_t LLVMDisassembler
<COFFT
>::entryAddress() {
437 const auto coffobject
= dyn_cast
<COFFObjectFile
>(o
);
438 const struct pe32_header
* pe32_header
;
439 const struct pe32plus_header
* pe32plus_header
;
441 coffobject
->getPE32PlusHeader(pe32plus_header
);
443 if (pe32plus_header
) {
444 return pe32plus_header
->AddressOfEntryPoint
;
446 coffobject
->getPE32Header(pe32_header
);
447 return pe32_header
->AddressOfEntryPoint
;
// Mach-O specialisation of entryAddress().
// NOTE(review): only the signature is visible in this listing; the body (and
// the preceding `template <>` line) must be looked up in the full file.
452 uint64_t LLVMDisassembler
<MACHOT
>::entryAddress() {
457 template <typename ELFT
>
458 uint64_t LLVMDisassembler
<ELFT
>::entryAddress() {
459 const auto elffile
= dyn_cast
<ELFObjectFile
<ELFT
>>(o
)->getELFFile();
460 const auto * header
= elffile
->getHeader();
462 return header
->e_entry
;
// Splits basic blocks of `function` that contain the start of another block.
//
// Every block of the function is re-decoded instruction by instruction. When
// the manager knows another block at the position being examined and the
// current instruction does not already end the block, the current block is
// shortened to end after that instruction, the other block is added to the
// function and becomes successor 0, and successor 1 is cleared.
// Blocks whose end address is 0 are reported as unfinished.
//
// NOTE(review): the loop increment, the full argument of getBasicBlock() and
// the condition guarding the split are only partly visible in this listing.
465 template <typename ELFT
>
466 void LLVMDisassembler
<ELFT
>::splitBlocks(Function
* function
) {
467 SectionRef text_section
= getTextSection();
// Raw bytes of the text section, wrapped per LLVM version.
469 text_section
.getContents(bytes
);
471 StringRefMemoryObject
ref(bytes
);
472 #elif defined(LLVM_36)
473 ArrayRef
<uint8_t> bytearray(reinterpret_cast<const uint8_t *>(bytes
.data()),
478 LOG4CXX_DEBUG(logger
, "Splitting Blocks in Function " << function
->getName());
479 // Split blocks where jumps are going inside the block
480 for (auto it
= function
->blocks().begin();
481 it
!= function
->blocks().end();
483 BasicBlock
* current_block
= it
->second
;
// An end address of 0 marks a block that was never completed.
484 if (current_block
->getEndAddress() == 0) {
485 LOG4CXX_ERROR(logger
, "UNFINISHED BLOCK " << std::hex
<< current_block
->getStartAddress());
489 uint64_t base_address
;
491 text_section
.getAddress(base_address
);
492 #elif defined(LLVM_36)
493 base_address
= text_section
.getAddress();
// Walk the block using offsets relative to the text section.
495 uint64_t current_address
= current_block
->getStartAddress() - base_address
;
496 while(current_block
->getEndAddress() - base_address
> current_address
) {
499 llvm::raw_string_ostream
s(buf
);
501 if(llvm::MCDisassembler::Success
==
503 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
504 #elif defined(LLVM_36)
505 DisAsm
->getInstruction(inst
, inst_size
,
506 bytearray
.slice(current_address
),
507 base_address
+ current_address
,
511 // See if some other block starts here
512 BasicBlock
* other
= manager
->getBasicBlock(current_address
516 // Special case, other block starts here but we are at the end anyway
// Absolute address just past the current instruction.
518 uint64_t endaddress
= current_address
+ inst_size
+ base_address
;
519 if (endaddress
!= current_block
->getEndAddress()) {
520 LOG4CXX_DEBUG(logger
, "Shortening block starting at "
522 << current_block
->getStartAddress()
524 << other
->getStartAddress());
// Cut the current block here and chain it to the other block.
525 function
->addBasicBlock(other
);
526 current_block
->setEndAddress(endaddress
);
527 current_block
->setNextBlock(0, other
->getStartAddress());
528 current_block
->setNextBlock(1, 0);
534 current_address
+= inst_size
;
// COFF specialisation of readDynamicSymbols().
// NOTE(review): only the signature is visible in this listing; the body is
// missing and must be looked up in the full file.
540 void LLVMDisassembler
<COFFT
>::readDynamicSymbols() {
// Mach-O specialisation of readDynamicSymbols().
// NOTE(review): only the signature is visible in this listing; the body is
// missing and must be looked up in the full file.
545 void LLVMDisassembler
<MACHOT
>::readDynamicSymbols() {
549 template <typename ELFT
>
550 void LLVMDisassembler
<ELFT
>::readDynamicSymbols() {
551 const auto elffile
= dyn_cast
<ELFObjectFile
<ELFT
>>(o
)->getELFFile();
552 for (auto it
= elffile
->begin_dynamic_symbols(),
553 end
= elffile
->end_dynamic_symbols();
556 if (it
->getType() == 2) { // Function
558 // TODO: Error handling
559 std::string symbolname
= *(elffile
->getSymbolName(it
));
560 std::string symbolversion
= *(elffile
->getSymbolVersion(nullptr, &*it
, is_default
));
561 // TODO: actually get the symbol address from relocations
562 Function
* f
= manager
->newDynamicFunction(0);
563 f
->setName(symbolname
+ (is_default
? "@@" : "@") + symbolversion
);
564 manager
->finishFunction(f
);
566 LOG4CXX_DEBUG(logger
, "Adding dynamic Symbol " << symbolname
<< (is_default
? "@@" : "@") << symbolversion
);
// Fills the `symbols` map with every symbol of the object file, keyed by the
// symbol's name. A symbol whose name cannot be read is reported through the
// logger.
// NOTE(review): the statement following the error log (continue/break/return)
// and the declarations of `ec` and `name` are not visible in this listing.
571 template <typename ELFT
>
572 void LLVMDisassembler
<ELFT
>::readSymbols() {
574 symbol_iterator
si(o
->symbol_begin()), se(o
->symbol_end());
575 for (; si
!= se
; ++si
) {
// getName() yields a non-zero error code on failure.
577 if ((ec
= si
->getName(name
))) {
578 LOG4CXX_ERROR(logger
, ec
.message());
581 LOG4CXX_DEBUG(logger
, "Added symbol " << name
.str());
// insert() keeps an existing entry if the same name is seen again.
582 symbols
.insert(make_pair(name
.str(), *si
));
// Fills the `sections` map with every section of the object file, keyed by
// the section's name. A section whose name cannot be read is reported
// through the logger.
// NOTE(review): the statement following the error log (continue/break/return)
// and the declarations of `ec` and `name` are not visible in this listing.
586 template <typename ELFT
>
587 void LLVMDisassembler
<ELFT
>::readSections() {
589 section_iterator
i(o
->section_begin()), e(o
->section_end());
590 for (; i
!= e
; ++i
) {
// getName() yields a non-zero error code on failure.
592 if ((ec
= i
->getName(name
))) {
593 LOG4CXX_ERROR(logger
, ec
.message());
596 LOG4CXX_DEBUG(logger
, "Added section " << name
.str());
// insert() keeps an existing entry if the same name is seen again.
597 sections
.insert(make_pair(name
.str(), *i
));
602 // template <typename ELFT>
603 // void LLVMDisassembler<ELFT>::forEachFunction(std::function<void (uint64_t, Function*)> callback) {
604 // // std::for_each(functions.begin(), functions.end(),
605 // // [&](std::pair<uint64_t, Function*> x) {
606 // // callback(x.first, x.second);
// Decodes the instructions of `block` and returns them as Instruction objects.
//
// The range [block start, block end) is walked in text-section-relative
// offsets. For each successfully decoded instruction an Instruction is
// appended, built from its absolute address, its trimmed printed text, its
// raw bytes and a fourth argument named `ref`. When decoding fails, a warning
// is logged and an "Invalid Instruction" entry holding one byte is appended.
//
// NOTE(review): several declarations (e.g. of `bytes`, `ref`, `inst`,
// `inst_size`, `jmptarget`), `#if` lines and closing braces are missing from
// this listing; in particular it is not visible how `inst_size` is set on the
// failure path before the offset is advanced.
610 template <typename ELFT
>
611 std::vector
<Instruction
> LLVMDisassembler
<ELFT
>::getInstructions(const BasicBlock
*block
) {
612 std::vector
<Instruction
> result
;
613 SectionRef text_section
= getTextSection();
614 uint64_t base_address
;
616 text_section
.getAddress(base_address
);
617 #elif defined(LLVM_36)
618 base_address
= text_section
.getAddress();
// Both bounds are offsets relative to the start of the text section.
621 uint64_t current_address
= block
->getStartAddress() - base_address
;
622 uint64_t end_position
= block
->getEndAddress() - base_address
;
// Raw bytes of the text section, wrapped per LLVM version.
625 text_section
.getContents(bytes
);
627 StringRefMemoryObject
ref(bytes
);
628 #elif defined(LLVM_36)
629 ArrayRef
<uint8_t> bytearray(reinterpret_cast<const uint8_t *>(bytes
.data()),
634 while (current_address
< end_position
) {
// `s` collects the printed form of the instruction in `buf`.
638 llvm::raw_string_ostream
s(buf
);
640 if(llvm::MCDisassembler::Success
==
642 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
643 #elif defined(LLVM_36)
644 DisAsm
->getInstruction(inst
, inst_size
,
645 bytearray
.slice(current_address
),
646 base_address
+ current_address
,
// Copy the raw bytes of the instruction into a local buffer.
// NOTE(review): the array size is a runtime value (a compiler extension in C++).
650 uint8_t bytes
[inst_size
+2];
652 ref
.readBytes(current_address
, inst_size
, bytes
);
653 #elif defined(LLVM_36)
654 size_t bytesindex(0);
655 for (uint8_t byte
: bytearray
.slice(current_address
, inst_size
)) {
656 bytes
[bytesindex
++] = byte
;
// Print the instruction; for evaluable branches a textual reference to the
// absolute target is built, prefixed with "function:" for calls.
662 IP
->printInst(&inst
, s
, "");
663 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
664 std::stringstream stream
;
665 if (MIA
->isCall(inst
))
666 stream
<< "function:";
670 stream
<< std::hex
<< (base_address
+ jmptarget
);
673 result
.push_back(Instruction(current_address
+ base_address
, boost::algorithm::trim_copy(s
.str()),
674 std::vector
<uint8_t>(bytes
, bytes
+inst_size
), ref
));
// Decoding failed: record a single invalid byte.
676 LOG4CXX_WARN(logger
, "Invalid byte at" << std::hex
<< current_address
+ base_address
);
679 ref
.readBytes(current_address
, 1, bytes
);
680 #elif defined(LLVM_36)
681 bytes
[0] = bytearray
[current_address
];
683 result
.push_back(Instruction(current_address
+ base_address
, "Invalid Instruction",
684 std::vector
<uint8_t>(bytes
, bytes
+1), ""));
688 current_address
+= inst_size
;
// Decodes the instructions in the absolute address range [start, end) and
// invokes `fun` once per instruction.
//
// On success `fun` receives the instruction's raw bytes, its size, the
// printed text and a fourth argument named `ref`. On a decoding failure a
// warning is logged and `fun` is called with (NULL, 0, "Invalid Byte", "").
//
// NOTE(review): part of the callback's parameter list, several declarations,
// `#if` lines and closing braces are missing from this listing; in particular
// it is not visible how `inst_size` is set on the failure path before the
// offset is advanced.
693 template <typename ELFT
>
694 void LLVMDisassembler
<ELFT
>::printEachInstruction(uint64_t start
, uint64_t end
,
695 std::function
<void (uint8_t*, size_t,
697 const std::string
&)> fun
) {
698 SectionRef text_section
= getTextSection();
699 uint64_t base_address
;
701 text_section
.getAddress(base_address
);
702 #elif defined(LLVM_36)
703 base_address
= text_section
.getAddress();
// Offset of `start` relative to the beginning of the text section.
706 uint64_t current_address
= start
- base_address
;
// Raw bytes of the text section, wrapped per LLVM version.
709 text_section
.getContents(bytes
);
711 StringRefMemoryObject
ref(bytes
);
712 #elif defined(LLVM_36)
713 ArrayRef
<uint8_t> bytearray(reinterpret_cast<const uint8_t *>(bytes
.data()),
718 while (current_address
< end
- base_address
) {
// `s` collects the printed form of the instruction in `buf`.
722 llvm::raw_string_ostream
s(buf
);
724 if(llvm::MCDisassembler::Success
==
726 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
727 #elif defined(LLVM_36)
728 DisAsm
->getInstruction(inst
, inst_size
,
729 bytearray
.slice(current_address
),
730 base_address
+ current_address
,
// Copy the raw bytes of the instruction into a local buffer.
// NOTE(review): the array size is a runtime value (a compiler extension in C++).
734 uint8_t bytes
[inst_size
+2];
736 ref
.readBytes(current_address
, inst_size
, bytes
);
737 #elif defined(LLVM_36)
738 size_t bytesindex(0);
739 for (uint8_t byte
: bytearray
.slice(current_address
, inst_size
)) {
740 bytes
[bytesindex
++] = byte
;
// Print the instruction; for evaluable branches a textual reference to the
// absolute target is built, prefixed with "function:" for calls.
746 IP
->printInst(&inst
, s
, "");
747 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
748 std::stringstream stream
;
749 if (MIA
->isCall(inst
))
750 stream
<< "function:";
754 stream
<< std::hex
<< (base_address
+ jmptarget
);
// Hand the decoded instruction to the callback.
759 fun(bytes
, inst_size
, s
.str(), ref
);
// Decoding failed: report it to the log and to the callback.
761 LOG4CXX_WARN(logger
, "Invalid byte at" << std::hex
<< current_address
+ base_address
);
762 fun(NULL
, 0, "Invalid Byte", "");
766 current_address
+= inst_size
;
770 template <typename ELFT
>
771 SectionRef LLVMDisassembler
<ELFT
>::getTextSection() {
772 return sections
[".text"];
776 SectionRef LLVMDisassembler
<MACHOT
>::getTextSection() {
777 return sections
["__text"];