1 #include "disassembler/Instruction.hxx"
2 #include "disassembler/llvm/LLVMDisassembler.hxx"
3 #include "core/InformationManager.hxx"
4 #include "core/Function.hxx"
5 #include "core/BasicBlock.hxx"
6 #include <boost/algorithm/string.hpp>
13 using namespace llvm::object
;
14 using std::error_code
;
// Factory function: opens `filename` via createBinary() and returns a
// heap-allocated (`new`) LLVMDisassembler instantiated for the detected
// object-file flavour: ELF 32/64-bit little/big endian, COFF or Mach-O.
//
// filename: path handed to createBinary() and to the disassembler constructor.
// manager:  InformationManager forwarded unchanged to the constructor.
//
// If createBinary() reports an error, its message is logged.
// NOTE(review): the embedded line numbers skip (e.g. 36 -> 40, 45 -> 50), so
// the error return path, the `#if` that opens the `#elif defined(LLVM_36)`
// below, the closing braces and the final fallback return are not visible in
// this listing. Confirm them against the complete file.
29 Disassembler
* createLLVMDisassembler(const std::string
& filename
, InformationManager
* manager
) {
30 log4cxx::LoggerPtr
logger(log4cxx::Logger::getLogger("disassembler.LLVMDisassembler"));
34 auto retval
= createBinary(filename
);
35 if (error_code ec
= retval
.getError()) {
36 LOG4CXX_ERROR(logger
, ec
.message());
// Variant preceding the `#elif`: take the Binary pointer directly from the result.
40 Binary
* op
= retval
.get();
41 #elif defined(LLVM_36)
// LLVM_36 variant: the result is moved into an OwningBinary first and the
// Binary pointer is obtained from it.
42 OwningBinary
<Binary
> ob
;
43 ob
= std::move(retval
.get());
44 Binary
* op
= ob
.getBinary();
// NOTE(review): the result of takeBinary() is kept in `foo`; what happens to
// it afterwards is not visible here.
45 auto foo
= ob
.takeBinary();
// Try each supported object-file class in turn; the first successful
// dyn_cast decides which LLVMDisassembler instantiation is returned.
50 // ELFType<endian, maxalign, 64bit>
51 if (ELF32LEObjectFile
* object
= dyn_cast
<ELF32LEObjectFile
>(op
)) {
52 return new LLVMDisassembler
<ELFType
<support::little
, 2, false>>(filename
, manager
, object
);
54 if (ELF64LEObjectFile
* object
= dyn_cast
<ELF64LEObjectFile
>(op
)) {
55 return new LLVMDisassembler
<ELFType
<support::little
, 2, true>>(filename
, manager
, object
);
57 if (ELF32BEObjectFile
* object
= dyn_cast
<ELF32BEObjectFile
>(op
)) {
58 return new LLVMDisassembler
<ELFType
<support::big
, 2, false>>(filename
, manager
, object
);
60 if (ELF64BEObjectFile
* object
= dyn_cast
<ELF64BEObjectFile
>(op
)) {
61 return new LLVMDisassembler
<ELFType
<support::big
, 2, true>>(filename
, manager
, object
);
63 if (COFFObjectFile
* object
= dyn_cast
<COFFObjectFile
>(op
)) {
64 return new LLVMDisassembler
<COFFT
>(filename
, manager
, object
);
66 if (MachOObjectFile
* object
= dyn_cast
<MachOObjectFile
>(op
)) {
67 return new LLVMDisassembler
<MACHOT
>(filename
, manager
, object
);
74 * TODO: fallback code in case the file is not an ELF/PE/COFF/Mach-O/... binary
75 * but, e.g., just raw instructions or a boot sector or
// Constructor: loads the binary at `filename` and builds the LLVM MC objects
// used for disassembly. In the visible code these are the target, register
// info (MRI), assembly info (AsmInfo), subtarget info (STI), instruction info
// (MII), object file info (MOFI), an MCContext, the disassembler (DisAsm),
// instruction analysis (MIA) and the instruction printer (IP).
//
// The visible initializers set the logger and start `triple` as
// "unknown-unknown-unknown"; its architecture is later taken from the object.
// Each creation step is followed by an error log naming what was missing.
// NOTE(review): the embedded line numbers skip, so the remaining parameters
// and initializers, the null checks guarding each LOG4CXX_ERROR, the `#if`
// opening the LLVM_36 `#elif`, and any early returns are not visible here.
78 template <typename ELFT
>
79 LLVMDisassembler
<ELFT
>::LLVMDisassembler(const std::string
& filename
,
80 InformationManager
* manager
,
83 , logger(log4cxx::Logger::getLogger("disassembler.LLVMDisassembler"))
84 , triple("unknown-unknown-unknown")
87 LOG4CXX_DEBUG(logger
, "Handling file " << filename
);
// Open the file as a Binary; a failure is logged together with its message.
90 auto result
= createBinary(filename
);
93 if ((ec
= result
.getError())) {
94 LOG4CXX_ERROR(logger
, "Failed to load Binary" << ec
.message());
// Variant preceding the `#elif`: the member `binary` takes the pointer
// returned by result.get().
100 binary
.reset(result
.get());
101 #elif defined(LLVM_36)
// LLVM_36 variant: move the result into an OwningBinary and fetch the Binary
// pointer from it.
102 OwningBinary
<Binary
> ob
;
103 ob
= std::move(result
.get());
104 Binary
* op
= ob
.getBinary();
// View the loaded binary as an ObjectFile. dyn_cast can yield null.
// NOTE(review): no null check is visible before `o` is dereferenced below.
109 o
= dyn_cast
<ObjectFile
>(binary
.get());
// Derive the target triple from the object's architecture.
115 triple
.setArch(Triple::ArchType(o
->getArch()));
116 std::string
tripleName(triple
.getTriple());
118 LOG4CXX_INFO(logger
, "Architecture " << tripleName
);
// Look up the LLVM target for the triple; `es` receives the error string.
122 target
= TargetRegistry::lookupTarget("", triple
, es
);
124 LOG4CXX_ERROR(logger
, es
);
128 LOG4CXX_INFO(logger
, "Target " << target
->getName());
130 MRI
.reset(target
->createMCRegInfo(tripleName
));
132 LOG4CXX_ERROR(logger
, "no register info for target " << tripleName
);
136 // Set up disassembler.
137 AsmInfo
.reset(target
->createMCAsmInfo(*MRI
, tripleName
));
139 LOG4CXX_ERROR(logger
, "no assembly info for target " << tripleName
);
// Subtarget info is created with empty CPU and feature strings.
143 STI
.reset(target
->createMCSubtargetInfo(tripleName
, "", ""));
145 LOG4CXX_ERROR(logger
, "no subtarget info for target " << tripleName
);
149 MII
.reset(target
->createMCInstrInfo());
151 LOG4CXX_ERROR(logger
, "no instruction info for target " << tripleName
);
// The MCContext is a local built from the raw pointers of the members above.
155 MOFI
.reset(new MCObjectFileInfo
);
156 MCContext
Ctx(AsmInfo
.get(), MRI
.get(), MOFI
.get());
158 DisAsm
.reset(target
->createMCDisassembler(*STI
, Ctx
));
160 LOG4CXX_ERROR(logger
, "no disassembler for target " << tripleName
);
// Tail of a statement that creates the relocation info; its beginning is not
// visible in this listing.
164 target
->createMCRelocationInfo(tripleName
, Ctx
));
167 // MCObjectSymbolizer::createObjectSymbolizer(Ctx, std::move(RelInfo), o));
169 // DisAsm->setSymbolizer(std::move(Symzer));
174 MIA
.reset(target
->createMCInstrAnalysis(MII
.get()));
176 LOG4CXX_ERROR(logger
, "no instruction analysis for target " << tripleName
);
// The instruction printer uses the assembler dialect reported by AsmInfo.
180 int AsmPrinterVariant
= AsmInfo
->getAssemblerDialect();
181 IP
.reset(target
->createMCInstPrinter(AsmPrinterVariant
, *AsmInfo
, *MII
, *MRI
, *STI
));
183 LOG4CXX_ERROR(logger
, "no instruction printer for target " << tripleName
);
// Configure the printer to emit immediates in hexadecimal (C style).
187 IP
->setPrintImmHex(llvm::HexStyle::C
);
188 IP
->setPrintImmHex(true);
190 // std::unique_ptr<MCObjectDisassembler> OD(
191 // new MCObjectDisassembler(*o, *DisAsm, *MIA));
192 //Mod.reset(OD->buildModule(false));
// Starts processing of the loaded binary.
// NOTE(review): only the readDynamicSymbols() call is visible; the embedded
// line numbers 199, 200 and 202 are absent from this listing, so the other
// steps performed here cannot be confirmed.
197 template <typename ELFT
>
198 void LLVMDisassembler
<ELFT
>::start() {
201 readDynamicSymbols();
// Destructor with an empty body: no explicit cleanup is performed here.
204 template <typename ELFT
>
205 LLVMDisassembler
<ELFT
>::~LLVMDisassembler() {}
// Disassembles the function starting at the absolute address `address`.
//
// address: start address, compared against the text section's
//          [base_address, base_address + size) range.
// name:    name given to a newly created function; an alternative generated
//          name has the form "<Unnamed 0x<hex address>>".
//
// A new Function is only created when manager->getFunction(address) yields
// NULL; it is then passed to disassembleFunction().
// NOTE(review): the return statements, the action taken for an out-of-range
// address, and the condition choosing between the generated and the supplied
// name are on lines missing from this listing.
207 template <typename ELFT
>
208 Function
* LLVMDisassembler
<ELFT
>::disassembleFunctionAt(uint64_t address
, const std::string
& name
) {
210 SectionRef text_section
= getTextSection();
211 uint64_t base_address
, size
;
// Variant preceding the `#elif`: address and size come back via out-parameters.
213 text_section
.getAddress(base_address
);
214 text_section
.getSize(size
);
215 #elif defined(LLVM_36)
// LLVM_36 variant: the getters return the values directly.
216 base_address
= text_section
.getAddress();
217 size
= text_section
.getSize();
// Range check: is `address` outside the text section?
219 if (address
< base_address
||
220 address
>= base_address
+ size
) {
// Only create a function if the manager does not know one at this address.
224 if (NULL
== (function
= manager
->getFunction(address
))) {
// Generated-name path.
228 s
<< "<Unnamed 0x" << std::hex
<< address
<< ">";
229 function
= manager
->newFunction(address
);
230 function
->setName(s
.str());
// Supplied-name path.
232 function
= manager
->newFunction(address
);
233 function
->setName(name
);
235 disassembleFunction(function
);
// Discovers the basic blocks of `function` by following control flow.
//
// Blocks still to be decoded are kept on a stack (`remaining_blocks`),
// starting with one block at the function's start address. Each popped block
// is decoded instruction by instruction. Direct branch targets become new
// blocks or reuse blocks already recorded in `new_blocks`. Direct call
// targets unknown to the manager are collected in `called_functions` and
// disassembled after this function is finished.
//
// Instruction addresses inside the loop (`current_address`) are offsets
// relative to the text section's base address; `base_address` is added
// wherever an absolute address is needed.
// NOTE(review): declarations of `bytes`, `inst`, `inst_size`, `jmptarget` and
// `buf`, the `#if` guards opening each `#elif defined(LLVM_36)`, the `else`
// branches and the closing braces are on lines missing from this listing.
241 template <typename ELFT
>
242 void LLVMDisassembler
<ELFT
>::disassembleFunction(Function
* function
) {
243 std::vector
<uint64_t> called_functions
;
244 std::stack
<BasicBlock
*> remaining_blocks
;
246 * Do all blocks get added properly? We should take care to remove
247 * the other ones at the end of the function!
// Maps a block's start address to the block created for it in this run.
249 std::map
<uint64_t, BasicBlock
*> new_blocks
;
250 SectionRef text_section
= getTextSection();
// Fetch the raw contents of the text section.
252 text_section
.getContents(bytes
);
254 StringRefMemoryObject
ref(bytes
);
255 #elif defined(LLVM_36)
256 ArrayRef
<uint8_t> bytearray(reinterpret_cast<const uint8_t *>(bytes
.data()),
259 #error LLVM != 3.5 | 3.6 not supported
262 LOG4CXX_DEBUG(logger
, "Handling function " << function
->getName());
// Seed the work stack with the block at the function's start address.
264 BasicBlock
* block
= manager
->newBasicBlock(function
->getStartAddress());
265 remaining_blocks
.push(block
);
266 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
267 function
->addBasicBlock(block
);
269 uint64_t base_address
, size
;
271 text_section
.getAddress(base_address
);
272 text_section
.getSize(size
);
273 #elif defined(LLVM_36)
274 base_address
= text_section
.getAddress();
275 size
= text_section
.getSize();
277 LOG4CXX_DEBUG(logger
, "Text section at " << std::hex
<< base_address
<< " with size " << size
);
// Process pending blocks until the stack is empty.
279 while (remaining_blocks
.size()) {
280 BasicBlock
* current_block
= remaining_blocks
.top();
281 remaining_blocks
.pop();
283 LOG4CXX_DEBUG(logger
, "Handling Block starting at " << std::hex
284 << current_block
->getStartAddress());
// Convert the block's absolute start address into a section-relative offset.
287 uint64_t current_address
= current_block
->getStartAddress() - base_address
;
291 llvm::raw_string_ostream
s(buf
);
// Decode one instruction at the current offset.
293 if(llvm::MCDisassembler::Success
==
295 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
296 #elif defined(LLVM_36)
297 DisAsm
->getInstruction(inst
, inst_size
,
298 bytearray
.slice(current_address
),
299 base_address
+ current_address
,
// If the branch target can be evaluated, make it absolute by adding the
// section base.
304 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
305 jmptarget
+= base_address
;
306 if (!MIA
->isIndirectBranch(inst
)) {
// Direct call: remember the callee if the manager has no function there yet.
307 if (MIA
->isCall(inst
)) {
308 if (NULL
== manager
->getFunction(jmptarget
))
309 called_functions
.push_back(jmptarget
);
// Successor slot 0 is set to the branch target; create its block if it is
// not yet in `new_blocks`.
311 current_block
->setNextBlock(0, jmptarget
);
312 if (new_blocks
.find(jmptarget
) == new_blocks
.end()) {
313 BasicBlock
* block
= manager
->newBasicBlock(jmptarget
);
315 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
316 function
->addBasicBlock(block
);
317 remaining_blocks
.push(block
);
// NOTE(review): this message prints current_block's start address, not the
// address of the block being reused (jmptarget).
319 LOG4CXX_DEBUG(logger
, "Reusing Block starting at " << std::hex
320 << current_block
->getStartAddress());
321 function
->addBasicBlock(new_blocks
.find(jmptarget
)->second
);
// Conditional branch: successor slot 1 is set to the address immediately
// after this instruction.
323 if (MIA
->isConditionalBranch(inst
)) {
324 jmptarget
= base_address
+ current_address
+ inst_size
;
325 current_block
->setNextBlock(1, jmptarget
);
326 if (new_blocks
.find(jmptarget
) == new_blocks
.end()) {
327 BasicBlock
* block
= manager
->newBasicBlock(jmptarget
);
329 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
330 function
->addBasicBlock(block
);
331 remaining_blocks
.push(block
);
// NOTE(review): as above, the logged address is current_block's, not jmptarget.
333 LOG4CXX_DEBUG(logger
, "Reusing Block starting at " << std::hex
334 << current_block
->getStartAddress());
335 function
->addBasicBlock(new_blocks
.find(jmptarget
)->second
);
// A block ends at a zero-sized decode, a terminator or any branch; the end
// address is the absolute address just past the current instruction.
346 if (inst_size
== 0 || MIA
->isTerminator(inst
) || MIA
->isBranch(inst
)) {
347 current_block
->setEndAddress(current_address
+ base_address
+ inst_size
);
348 LOG4CXX_DEBUG(logger
, "Finished Block at " << std::hex
<<
349 current_block
->getEndAddress());
352 current_address
+= inst_size
;
// Post-processing: split blocks, mark the function finished, then
// disassemble the collected call targets.
355 splitBlocks(function
);
356 LOG4CXX_DEBUG(logger
, "Finished function " << function
->getName());
357 manager
->finishFunction(function
);
358 for (uint64_t address
: called_functions
)
359 disassembleFunctionAt(address
);
// Top-level disassembly driver.
//
// Steps visible in this listing:
//   1. Walk `symbols`, applying a text-section containment test and a
//      function-type test; for symbols with a readable address, create a
//      Function, name it after the symbol and queue it.
//   2. Disassemble every queued function.
//   3. For ELF binaries, also disassemble from the entry address, named
//      "<_start 0x<hex address>>".
//   4. If the manager has no functions at all, start at the beginning of the
//      text section.
// NOTE(review): the statements controlled by the `if` conditions in step 1
// (presumably `continue`), the `#if` guards and several declarations are on
// lines missing from this listing.
362 template <typename ELFT
>
363 void LLVMDisassembler
<ELFT
>::disassemble() {
364 SectionRef text_section
= getTextSection();
365 std::vector
<Function
*> remaining_functions
;
367 // Assume all function symbols actually start a real function
368 for (auto x
= symbols
.begin(); x
!= symbols
.end(); ++x
) {
371 SymbolRef::Type symbol_type
;
// Containment test, variant preceding the `#elif` (result via out-parameter).
374 if (text_section
.containsSymbol(x
->second
, contains
) || !contains
)
375 #elif defined(LLVM_36)
// LLVM_36 variant: containsSymbol() returns the result directly.
376 if (text_section
.containsSymbol(x
->second
))
// True when the type query fails or the symbol is not a function.
380 if (x
->second
.getType(symbol_type
)
381 || SymbolRef::ST_Function
!= symbol_type
)
// getAddress() is treated as successful when it evaluates to false.
384 if (!x
->second
.getAddress(result
)) {
385 Function
* fun
= manager
->newFunction(result
);
387 fun
->setName(x
->first
);
388 remaining_functions
.push_back(fun
);
389 LOG4CXX_DEBUG(logger
, "Disasembling " << x
->first
);
391 LOG4CXX_DEBUG(logger
, "Function at " << std::hex
<< result
392 << " already disassembled as " << manager
->getFunction(result
)->getName());
// NOTE(review): disassembleFunction() already calls manager->finishFunction(),
// so the call below finishes each function a second time.
397 for (Function
* function
: remaining_functions
) {
398 disassembleFunction(function
);
399 manager
->finishFunction(function
);
// ELF only: add the entry point as an additional function.
402 if (binary
->isELF()) {
403 uint64_t _entryAddress
= entryAddress();
404 LOG4CXX_DEBUG(logger
, "Adding entryAddress at: " << std::hex
<< _entryAddress
);
406 s
<< "<_start 0x" << std::hex
<< _entryAddress
<< ">";
408 disassembleFunctionAt(_entryAddress
, s
.str());
// Fallback when nothing was found: disassemble from the start of the text section.
411 if (!manager
->hasFunctions()) {
414 text_section
.getAddress(text_entry
);
415 #elif defined(LLVM_36)
416 text_entry
= text_section
.getAddress();
418 LOG4CXX_INFO(logger
, "No Symbols found, starting at the beginning of the text segment");
419 disassembleFunctionAt(text_entry
);
// COFF specialisation: returns the AddressOfEntryPoint field of the PE32+
// header when one is available, otherwise that of the PE32 header.
// NOTE(review): `coffobject` (result of dyn_cast) and `pe32_header` are
// dereferenced without a visible null check; the `template <>` line and the
// closing braces are missing from this listing.
424 uint64_t LLVMDisassembler
<COFFT
>::entryAddress() {
425 const auto coffobject
= dyn_cast
<COFFObjectFile
>(o
);
426 const struct pe32_header
* pe32_header
;
427 const struct pe32plus_header
* pe32plus_header
;
// Try the PE32+ header first.
429 coffobject
->getPE32PlusHeader(pe32plus_header
);
431 if (pe32plus_header
) {
432 return pe32plus_header
->AddressOfEntryPoint
;
// Otherwise use the PE32 header.
434 coffobject
->getPE32Header(pe32_header
);
435 return pe32_header
->AddressOfEntryPoint
;
// Mach-O specialisation of entryAddress().
// NOTE(review): the body is not visible in this listing, so the returned
// value cannot be documented here.
440 uint64_t LLVMDisassembler
<MACHOT
>::entryAddress() {
// Generic (ELF) implementation: returns the e_entry field of the ELF header.
// NOTE(review): the dyn_cast result is dereferenced without a visible null check.
445 template <typename ELFT
>
446 uint64_t LLVMDisassembler
<ELFT
>::entryAddress() {
447 const auto elffile
= dyn_cast
<ELFObjectFile
<ELFT
>>(o
)->getELFFile();
448 const auto * header
= elffile
->getHeader();
450 return header
->e_entry
;
// Splits basic blocks of `function` at addresses where another basic block
// starts inside them.
//
// Each block of the function is re-decoded instruction by instruction. When
// the manager reports another block (`other`) at an instruction boundary
// before the block's end, the current block is shortened to end there:
// successor slot 0 is set to `other` and slot 1 is cleared to 0.
// Blocks whose end address is 0 are reported as unfinished.
// NOTE(review): the loop increment, the remaining arguments of
// getBasicBlock(), the condition testing `other`, declarations of `bytes`,
// `inst`, `inst_size` and `buf`, and the `#if` guards are on lines missing
// from this listing.
453 template <typename ELFT
>
454 void LLVMDisassembler
<ELFT
>::splitBlocks(Function
* function
) {
455 SectionRef text_section
= getTextSection();
457 text_section
.getContents(bytes
);
459 StringRefMemoryObject
ref(bytes
);
460 #elif defined(LLVM_36)
461 ArrayRef
<uint8_t> bytearray(reinterpret_cast<const uint8_t *>(bytes
.data()),
466 LOG4CXX_DEBUG(logger
, "Splitting Blocks in Function " << function
->getName());
467 // Split blocks where jumps are going inside the block
468 for (auto it
= function
->blocks().begin();
469 it
!= function
->blocks().end();
471 BasicBlock
* current_block
= it
->second
;
// An end address of 0 marks a block that was never finished.
472 if (current_block
->getEndAddress() == 0) {
473 LOG4CXX_ERROR(logger
, "UNFINISHED BLOCK " << std::hex
<< current_block
->getStartAddress());
477 uint64_t base_address
;
479 text_section
.getAddress(base_address
);
480 #elif defined(LLVM_36)
481 base_address
= text_section
.getAddress();
// Walk the block using section-relative offsets.
483 uint64_t current_address
= current_block
->getStartAddress() - base_address
;
484 while(current_block
->getEndAddress() - base_address
> current_address
) {
487 llvm::raw_string_ostream
s(buf
);
489 if(llvm::MCDisassembler::Success
==
491 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
492 #elif defined(LLVM_36)
493 DisAsm
->getInstruction(inst
, inst_size
,
494 bytearray
.slice(current_address
),
495 base_address
+ current_address
,
499 // See if some other block starts here
500 BasicBlock
* other
= manager
->getBasicBlock(current_address
504 // Special case, other block starts here but we are at the end anyway
// Absolute address just past the current instruction.
506 uint64_t endaddress
= current_address
+ inst_size
+ base_address
;
507 if (endaddress
!= current_block
->getEndAddress()) {
508 LOG4CXX_DEBUG(logger
, "Shortening block starting at "
510 << current_block
->getStartAddress()
512 << other
->getStartAddress());
// Cut the current block here and make `other` its only successor.
513 function
->addBasicBlock(other
);
514 current_block
->setEndAddress(endaddress
);
515 current_block
->setNextBlock(0, other
->getStartAddress());
516 current_block
->setNextBlock(1, 0);
522 current_address
+= inst_size
;
// COFF specialisation of readDynamicSymbols().
// NOTE(review): the body is not visible in this listing.
528 void LLVMDisassembler
<COFFT
>::readDynamicSymbols() {
// Mach-O specialisation of readDynamicSymbols().
// NOTE(review): the body is not visible in this listing.
533 void LLVMDisassembler
<MACHOT
>::readDynamicSymbols() {
// Generic (ELF) implementation: iterates over the ELF file's dynamic symbols
// and registers each symbol whose type equals 2 as a dynamic function with
// the manager. The function is named "<symbol>@@<version>" when the version
// is the default one and "<symbol>@<version>" otherwise.
// Every dynamic function is currently created at address 0 (see the TODO).
// NOTE(review): the loop condition/increment and the declaration of
// `is_default` are on lines missing from this listing. The results of
// getSymbolName()/getSymbolVersion() are dereferenced without error handling.
537 template <typename ELFT
>
538 void LLVMDisassembler
<ELFT
>::readDynamicSymbols() {
539 const auto elffile
= dyn_cast
<ELFObjectFile
<ELFT
>>(o
)->getELFFile();
540 for (auto it
= elffile
->begin_dynamic_symbols(),
541 end
= elffile
->end_dynamic_symbols();
544 if (it
->getType() == 2) { // Function
546 // TODO: Error handling
547 std::string symbolname
= *(elffile
->getSymbolName(it
));
548 std::string symbolversion
= *(elffile
->getSymbolVersion(nullptr, &*it
, is_default
));
549 // TODO: actually get the symbol address from relocations
550 Function
* f
= manager
->newDynamicFunction(0);
551 f
->setName(symbolname
+ (is_default
? "@@" : "@") + symbolversion
);
552 manager
->finishFunction(f
);
554 LOG4CXX_DEBUG(logger
, "Adding dynamic Symbol " << symbolname
<< (is_default
? "@@" : "@") << symbolversion
);
// Reads all symbols of the object file `o` and stores them in `symbols`,
// keyed by symbol name. A failure to obtain a symbol's name is logged.
// NOTE(review): declarations of `ec` and `name`, and whatever follows the
// error log (presumably skipping the symbol), are on lines missing from this
// listing.
559 template <typename ELFT
>
560 void LLVMDisassembler
<ELFT
>::readSymbols() {
562 symbol_iterator
si(o
->symbol_begin()), se(o
->symbol_end());
563 for (; si
!= se
; ++si
) {
// A non-zero error code from getName() means the name could not be read.
565 if ((ec
= si
->getName(name
))) {
566 LOG4CXX_ERROR(logger
, ec
.message());
569 LOG4CXX_DEBUG(logger
, "Added symbol " << name
.str());
570 symbols
.insert(make_pair(name
.str(), *si
));
// Reads all sections of the object file `o` and stores them in `sections`,
// keyed by section name. A failure to obtain a section's name is logged.
// NOTE(review): declarations of `ec` and `name`, and whatever follows the
// error log (presumably skipping the section), are on lines missing from this
// listing.
574 template <typename ELFT
>
575 void LLVMDisassembler
<ELFT
>::readSections() {
577 section_iterator
i(o
->section_begin()), e(o
->section_end());
578 for (; i
!= e
; ++i
) {
// A non-zero error code from getName() means the name could not be read.
580 if ((ec
= i
->getName(name
))) {
581 LOG4CXX_ERROR(logger
, ec
.message());
584 LOG4CXX_DEBUG(logger
, "Added section " << name
.str());
585 sections
.insert(make_pair(name
.str(), *i
));
590 // template <typename ELFT>
591 // void LLVMDisassembler<ELFT>::forEachFunction(std::function<void (uint64_t, Function*)> callback) {
592 // // std::for_each(functions.begin(), functions.end(),
593 // // [&](std::pair<uint64_t, Function*> x) {
594 // // callback(x.first, x.second);
// Decodes the instructions of `block` and returns them as Instruction objects.
//
// block:   basic block whose [start, end) address range is decoded.
// returns: one Instruction per successfully decoded instruction, built from
//          its absolute address, the trimmed printed text, its raw bytes and
//          a reference string. Undecodable positions produce an
//          "Invalid Instruction" entry carrying a single byte.
//
// Positions inside the loop are offsets relative to the text section's base
// address; `base_address` is added back when an Instruction is created.
// NOTE(review): declarations of the section-contents `bytes`, `inst`,
// `inst_size`, `jmptarget`, `buf` and the string `ref` passed to
// Instruction(), the `#if` guards, the `else` branches and the final return
// are on lines missing from this listing.
598 template <typename ELFT
>
599 std::vector
<Instruction
> LLVMDisassembler
<ELFT
>::getInstructions(const BasicBlock
*block
) {
600 std::vector
<Instruction
> result
;
601 SectionRef text_section
= getTextSection();
602 uint64_t base_address
;
604 text_section
.getAddress(base_address
);
605 #elif defined(LLVM_36)
606 base_address
= text_section
.getAddress();
// Section-relative offsets of the block's start and end.
609 uint64_t current_address
= block
->getStartAddress() - base_address
;
610 uint64_t end_position
= block
->getEndAddress() - base_address
;
613 text_section
.getContents(bytes
);
615 StringRefMemoryObject
ref(bytes
);
616 #elif defined(LLVM_36)
617 ArrayRef
<uint8_t> bytearray(reinterpret_cast<const uint8_t *>(bytes
.data()),
622 while (current_address
< end_position
) {
626 llvm::raw_string_ostream
s(buf
);
628 if(llvm::MCDisassembler::Success
==
630 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
631 #elif defined(LLVM_36)
632 DisAsm
->getInstruction(inst
, inst_size
,
633 bytearray
.slice(current_address
),
634 base_address
+ current_address
,
// Local buffer for the instruction's raw bytes. Its size depends on the
// runtime value inst_size, and the name shadows the section-contents `bytes`
// used above.
638 uint8_t bytes
[inst_size
+2];
640 ref
.readBytes(current_address
, inst_size
, bytes
);
641 #elif defined(LLVM_36)
// LLVM_36 variant: copy the instruction's bytes one by one out of the slice.
642 size_t bytesindex(0);
643 for (uint8_t byte
: bytearray
.slice(current_address
, inst_size
)) {
644 bytes
[bytesindex
++] = byte
;
// Print the instruction text into `s`.
650 IP
->printInst(&inst
, s
, "");
// For branches with an evaluable target, build a reference string:
// "function:" is written for calls, and the absolute target is appended in hex.
651 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
652 std::stringstream stream
;
653 if (MIA
->isCall(inst
))
654 stream
<< "function:";
658 stream
<< std::hex
<< (base_address
+ jmptarget
);
661 result
.push_back(Instruction(current_address
+ base_address
, boost::algorithm::trim_copy(s
.str()),
662 std::vector
<uint8_t>(bytes
, bytes
+inst_size
), ref
));
// Decode failure: record a one-byte placeholder instruction.
664 LOG4CXX_WARN(logger
, "Invalid byte at" << std::hex
<< current_address
+ base_address
);
667 ref
.readBytes(current_address
, 1, bytes
);
668 #elif defined(LLVM_36)
669 bytes
[0] = bytearray
[current_address
];
671 result
.push_back(Instruction(current_address
+ base_address
, "Invalid Instruction",
672 std::vector
<uint8_t>(bytes
, bytes
+1), ""));
676 current_address
+= inst_size
;
// Decodes the instructions in the absolute address range [start, end) and
// invokes `fun` once per position.
//
// start, end: absolute addresses; both are converted to offsets relative to
//             the text section's base address.
// fun:        callback receiving the raw bytes, their count, the printed
//             instruction text and a reference string. For an undecodable
//             position it is called as fun(NULL, 0, "Invalid Byte", "").
// NOTE(review): part of the callback's parameter list, declarations of the
// section-contents `bytes`, `inst`, `inst_size`, `jmptarget`, `buf` and the
// string `ref` passed to `fun`, the `#if` guards and the `else` branches are
// on lines missing from this listing.
681 template <typename ELFT
>
682 void LLVMDisassembler
<ELFT
>::printEachInstruction(uint64_t start
, uint64_t end
,
683 std::function
<void (uint8_t*, size_t,
685 const std::string
&)> fun
) {
686 SectionRef text_section
= getTextSection();
687 uint64_t base_address
;
689 text_section
.getAddress(base_address
);
690 #elif defined(LLVM_36)
691 base_address
= text_section
.getAddress();
// Section-relative offset at which decoding starts.
694 uint64_t current_address
= start
- base_address
;
697 text_section
.getContents(bytes
);
699 StringRefMemoryObject
ref(bytes
);
700 #elif defined(LLVM_36)
701 ArrayRef
<uint8_t> bytearray(reinterpret_cast<const uint8_t *>(bytes
.data()),
706 while (current_address
< end
- base_address
) {
710 llvm::raw_string_ostream
s(buf
);
712 if(llvm::MCDisassembler::Success
==
714 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
715 #elif defined(LLVM_36)
716 DisAsm
->getInstruction(inst
, inst_size
,
717 bytearray
.slice(current_address
),
718 base_address
+ current_address
,
// Local buffer for the instruction's raw bytes. Its size depends on the
// runtime value inst_size, and the name shadows the section-contents `bytes`
// used above.
722 uint8_t bytes
[inst_size
+2];
724 ref
.readBytes(current_address
, inst_size
, bytes
);
725 #elif defined(LLVM_36)
// LLVM_36 variant: copy the instruction's bytes one by one out of the slice.
726 size_t bytesindex(0);
727 for (uint8_t byte
: bytearray
.slice(current_address
, inst_size
)) {
728 bytes
[bytesindex
++] = byte
;
// Print the instruction text into `s`.
734 IP
->printInst(&inst
, s
, "");
// For branches with an evaluable target, build a reference string:
// "function:" is written for calls, and the absolute target is appended in hex.
735 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
736 std::stringstream stream
;
737 if (MIA
->isCall(inst
))
738 stream
<< "function:";
742 stream
<< std::hex
<< (base_address
+ jmptarget
);
// Hand the decoded instruction to the callback.
747 fun(bytes
, inst_size
, s
.str(), ref
);
// Decode failure: report an invalid byte to the callback.
749 LOG4CXX_WARN(logger
, "Invalid byte at" << std::hex
<< current_address
+ base_address
);
750 fun(NULL
, 0, "Invalid Byte", "");
754 current_address
+= inst_size
;
// Generic implementation: returns the entry of `sections` stored under the
// name ".text".
// NOTE(review): if `sections` is a std::map, operator[] default-inserts an
// entry when the key is absent. Confirm the container type.
758 template <typename ELFT
>
759 SectionRef LLVMDisassembler
<ELFT
>::getTextSection() {
760 return sections
[".text"];
// Mach-O specialisation: returns the entry of `sections` stored under the
// name "__text".
// NOTE(review): the `template <>` line and the closing brace are not visible
// in this listing.
764 SectionRef LLVMDisassembler
<MACHOT
>::getTextSection() {
765 return sections
["__text"];