1 #include "disassembler/Instruction.hxx"
2 #include "disassembler/llvm/LLVMDisassembler.hxx"
3 #include "core/InformationManager.hxx"
4 #include "core/Function.hxx"
5 #include "core/BasicBlock.hxx"
6 #include <boost/algorithm/string.hpp>
13 using namespace llvm::object
;
14 using std::error_code
;
// Factory for LLVMDisassembler objects.
//
// Opens `filename` with createBinary(), logs the error message through
// log4cxx when that reports an error, and then probes the resulting Binary
// with dyn_cast<> against each supported object-file class in turn
// (ELF32LE, ELF64LE, ELF32BE, ELF64BE, COFF, Mach-O). The first match yields
// a heap-allocated (`new`) LLVMDisassembler instantiated with the matching
// type tag.
//
// @param filename  path handed to createBinary() and to the constructor
// @param manager   forwarded unchanged to the LLVMDisassembler constructor
// @return the newly allocated disassembler for a recognised format.
//         NOTE(review): the return statements of the error / no-match paths
//         are not visible in this excerpt -- presumably a null pointer;
//         confirm before relying on it.
29 Disassembler
* createLLVMDisassembler(const std::string
& filename
, InformationManager
* manager
) {
30 log4cxx::LoggerPtr
logger(log4cxx::Logger::getLogger("disassembler.LLVMDisassembler"));
// Load the file as a generic object-file Binary; getError() is checked next.
34 auto retval
= createBinary(filename
);
35 if (error_code ec
= retval
.getError()) {
36 LOG4CXX_ERROR(logger
, ec
.message());
40 Binary
* op
= retval
.get();
// NOTE(review): the condition guarding this log is not visible here --
// presumably a null check on `op`; confirm.
43 LOG4CXX_ERROR(logger
, "Could not open " << filename
);
47 // ELFType<endian, maxalign, 64bit>
48 if (ELF32LEObjectFile
* object
= dyn_cast
<ELF32LEObjectFile
>(op
)) {
49 return new LLVMDisassembler
<ELFType
<support::little
, 2, false>>(filename
, manager
, object
);
51 if (ELF64LEObjectFile
* object
= dyn_cast
<ELF64LEObjectFile
>(op
)) {
52 return new LLVMDisassembler
<ELFType
<support::little
, 2, true>>(filename
, manager
, object
);
54 if (ELF32BEObjectFile
* object
= dyn_cast
<ELF32BEObjectFile
>(op
)) {
55 return new LLVMDisassembler
<ELFType
<support::big
, 2, false>>(filename
, manager
, object
);
57 if (ELF64BEObjectFile
* object
= dyn_cast
<ELF64BEObjectFile
>(op
)) {
58 return new LLVMDisassembler
<ELFType
<support::big
, 2, true>>(filename
, manager
, object
);
// Non-ELF formats use the COFFT / MACHOT tags instead of an ELFType.
60 if (COFFObjectFile
* object
= dyn_cast
<COFFObjectFile
>(op
)) {
61 return new LLVMDisassembler
<COFFT
>(filename
, manager
, object
);
63 if (MachOObjectFile
* object
= dyn_cast
<MachOObjectFile
>(op
)) {
64 return new LLVMDisassembler
<MACHOT
>(filename
, manager
, object
);
71 * TODO: fallback code in case the file is not an ELF/PE/COFF/Mach-O/... binary
72 * but, e.g., just raw instructions or a boot sector or
// Constructor: loads the binary and builds the LLVM MC objects used for
// disassembly.
//
// Visible steps, in order:
//   1. createBinary(filename); the result is stored in `binary` and cast to
//      an ObjectFile kept in `o`.
//   2. `triple` (initialised to "unknown-unknown-unknown") gets its
//      architecture from o->getArch(); the triple string is then used for
//      the target lookup and for every create* call below.
//   3. Register info, asm info, subtarget info, instruction info, object-file
//      info, an MCContext, the MCDisassembler, relocation info, instruction
//      analysis and the instruction printer are created from `target`.
//   4. An MCObjectDisassembler is constructed from *o, *DisAsm and *MIA.
// Each creation step is followed by a LOG4CXX_ERROR call; the conditions that
// guard those logs (and whatever early exits follow them) are not visible in
// this excerpt.
//
// @param filename  file to load; also used in log messages
// @param manager   InformationManager pointer (its use is not visible here)
// NOTE(review): the factory function already opens the file and passes the
// object in, yet createBinary(filename) is called again below -- confirm
// whether the second load is intended.
75 template <typename ELFT
>
76 LLVMDisassembler
<ELFT
>::LLVMDisassembler(const std::string
& filename
,
77 InformationManager
* manager
,
80 , logger(log4cxx::Logger::getLogger("disassembler.LLVMDisassembler"))
81 , triple("unknown-unknown-unknown")
84 LOG4CXX_DEBUG(logger
, "Handling file " << filename
);
87 auto result
= createBinary(filename
);
90 if ((ec
= result
.getError())) {
91 LOG4CXX_ERROR(logger
, "Failed to load Binary" << ec
.message());
96 binary
.reset(result
.get());
98 o
= dyn_cast
<ObjectFile
>(binary
.get());
// Only the architecture component of the triple is filled in from the object.
104 triple
.setArch(Triple::ArchType(o
->getArch()));
105 std::string
tripleName(triple
.getTriple());
107 LOG4CXX_INFO(logger
, "Architecture " << tripleName
);
// `es` is passed to lookupTarget and logged below (presumably on failure).
111 target
= TargetRegistry::lookupTarget("", triple
, es
);
113 LOG4CXX_ERROR(logger
, es
);
117 LOG4CXX_INFO(logger
, "Target " << target
->getName());
119 MRI
.reset(target
->createMCRegInfo(tripleName
));
121 LOG4CXX_ERROR(logger
, "no register info for target " << tripleName
);
125 // Set up disassembler.
126 AsmInfo
.reset(target
->createMCAsmInfo(*MRI
, tripleName
));
128 LOG4CXX_ERROR(logger
, "no assembly info for target " << tripleName
);
132 STI
.reset(target
->createMCSubtargetInfo(tripleName
, "", ""));
134 LOG4CXX_ERROR(logger
, "no subtarget info for target " << tripleName
);
138 MII
.reset(target
->createMCInstrInfo());
140 LOG4CXX_ERROR(logger
, "no instruction info for target " << tripleName
);
// The MCContext is a local object built from the three info objects above.
144 MOFI
.reset(new MCObjectFileInfo
);
145 MCContext
Ctx(AsmInfo
.get(), MRI
.get(), MOFI
.get());
147 DisAsm
.reset(target
->createMCDisassembler(*STI
, Ctx
));
149 LOG4CXX_ERROR(logger
, "no disassembler for target " << tripleName
);
153 target
->createMCRelocationInfo(tripleName
, Ctx
));
156 // MCObjectSymbolizer::createObjectSymbolizer(Ctx, std::move(RelInfo), o));
158 // DisAsm->setSymbolizer(std::move(Symzer));
163 MIA
.reset(target
->createMCInstrAnalysis(MII
.get()));
165 LOG4CXX_ERROR(logger
, "no instruction analysis for target " << tripleName
);
169 int AsmPrinterVariant
= AsmInfo
->getAssemblerDialect();
170 IP
.reset(target
->createMCInstPrinter(AsmPrinterVariant
, *AsmInfo
, *MII
, *MRI
, *STI
));
172 LOG4CXX_ERROR(logger
, "no instruction printer for target " << tripleName
);
// NOTE(review): setPrintImmHex is called twice in a row with different
// argument kinds (a HexStyle value, then `true`) -- confirm both are wanted.
176 IP
->setPrintImmHex(llvm::HexStyle::C
);
177 IP
->setPrintImmHex(true);
179 std::unique_ptr
<MCObjectDisassembler
> OD(
180 new MCObjectDisassembler(*o
, *DisAsm
, *MIA
));
181 //Mod.reset(OD->buildModule(false));
// Starts processing of the loaded binary. The only step visible in this
// excerpt is the call to readDynamicSymbols(); other calls may precede it.
186 template <typename ELFT
>
187 void LLVMDisassembler
<ELFT
>::start() {
190 readDynamicSymbols();
// Destructor with an empty body: no explicit cleanup is performed here.
193 template <typename ELFT
>
194 LLVMDisassembler
<ELFT
>::~LLVMDisassembler() {}
// Disassembles the function starting at `address`.
//
// The address is compared against the text section's range
// [base_address, base_address + size). If manager->getFunction(address)
// finds no existing function, a new one is registered through
// manager->newFunction(address) and named either with `name` or with a
// generated "<Unnamed 0x<hex address>>" label, and disassembleFunction() is
// then run on it.
//
// @param address  start address of the function
// @param name     name to assign to a newly created function
//                 (presumably the generated label is used when this is empty;
//                 the selecting condition is not visible -- confirm)
// @return pointer to the Function; the return statements, including the
//         out-of-range result, are not visible in this excerpt.
196 template <typename ELFT
>
197 Function
* LLVMDisassembler
<ELFT
>::disassembleFunctionAt(uint64_t address
, const std::string
& name
) {
199 SectionRef text_section
= getTextSection();
200 uint64_t base_address
, size
;
201 text_section
.getAddress(base_address
);
202 text_section
.getSize(size
);
// Range check: true when `address` lies outside the text section.
204 if (address
< base_address
||
205 address
>= base_address
+ size
) {
// Only create a function if the manager does not already know one here.
209 if (NULL
== (function
= manager
->getFunction(address
))) {
213 s
<< "<Unnamed 0x" << std::hex
<< address
<< ">";
214 function
= manager
->newFunction(address
);
215 function
->setName(s
.str());
217 function
= manager
->newFunction(address
);
218 function
->setName(name
);
220 disassembleFunction(function
);
// Discovers the basic blocks of `function` by following its control flow.
//
// A stack of pending blocks (`remaining_blocks`) is seeded with a block at
// the function's start address. For each popped block, instructions are
// decoded with DisAsm->getInstruction() at offsets relative to the text
// section's base address. When MIA->evaluateBranch() can compute a branch
// target:
//   * for direct calls, targets for which the manager knows no function yet
//     are collected in `called_functions`;
//   * otherwise the target becomes next-block 0 of the current block, and for
//     conditional branches the fall-through address becomes next-block 1;
//     targets not yet present in `new_blocks` get a fresh BasicBlock that is
//     added to the function and pushed on the stack.
// A block is closed (its end address set) when inst_size is 0 or the
// instruction is a terminator or branch. Once the stack is empty, the
// function is handed to splitBlocks() and manager->finishFunction(), and
// every collected call target is disassembled via disassembleFunctionAt().
//
// @param function  function whose blocks are to be discovered; blocks are
//                  added to it through addBasicBlock()
226 template <typename ELFT
>
227 void LLVMDisassembler
<ELFT
>::disassembleFunction(Function
* function
) {
228 std::vector
<uint64_t> called_functions
;
229 std::stack
<BasicBlock
*> remaining_blocks
;
231 * Do all blocks get added properly? We should take care to remove
232 * the other ones at the end of the function!
234 std::map
<uint64_t, BasicBlock
*> new_blocks
;
235 SectionRef text_section
= getTextSection();
237 text_section
.getContents(bytes
);
238 StringRefMemoryObject
ref(bytes
);
240 LOG4CXX_DEBUG(logger
, "Handling function " << function
->getName());
// Seed the work stack with the function's entry block.
242 BasicBlock
* block
= manager
->newBasicBlock(function
->getStartAddress());
243 remaining_blocks
.push(block
);
244 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
245 function
->addBasicBlock(block
);
247 uint64_t base_address
, size
;
248 text_section
.getAddress(base_address
);
249 text_section
.getSize(size
);
250 LOG4CXX_DEBUG(logger
, "Text section at " << std::hex
<< base_address
<< " with size " << size
);
252 while (remaining_blocks
.size()) {
253 BasicBlock
* current_block
= remaining_blocks
.top();
254 remaining_blocks
.pop();
256 LOG4CXX_DEBUG(logger
, "Handling Block starting at " << std::hex
257 << current_block
->getStartAddress());
// current_address is an offset into the text section, not an absolute address.
260 uint64_t current_address
= current_block
->getStartAddress() - base_address
;
264 llvm::raw_string_ostream
s(buf
);
266 if(llvm::MCDisassembler::Success
==
267 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
270 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
// evaluateBranch was given the section-relative offset; convert the target
// back to an absolute address.
271 jmptarget
+= base_address
;
272 if (!MIA
->isIndirectBranch(inst
)) {
273 if (MIA
->isCall(inst
)) {
274 if (NULL
== manager
->getFunction(jmptarget
))
275 called_functions
.push_back(jmptarget
);
277 current_block
->setNextBlock(0, jmptarget
);
278 if (new_blocks
.find(jmptarget
) == new_blocks
.end()) {
279 BasicBlock
* block
= manager
->newBasicBlock(jmptarget
);
281 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
282 function
->addBasicBlock(block
);
283 remaining_blocks
.push(block
);
// NOTE(review): this message prints the current block's start address, not
// `jmptarget` (the block actually being reused) -- confirm which is intended.
285 LOG4CXX_DEBUG(logger
, "Reusing Block starting at " << std::hex
286 << current_block
->getStartAddress());
287 function
->addBasicBlock(new_blocks
.find(jmptarget
)->second
);
289 if (MIA
->isConditionalBranch(inst
)) {
// Fall-through successor: the address right after this instruction.
290 jmptarget
= base_address
+ current_address
+ inst_size
;
291 current_block
->setNextBlock(1, jmptarget
);
292 if (new_blocks
.find(jmptarget
) == new_blocks
.end()) {
293 BasicBlock
* block
= manager
->newBasicBlock(jmptarget
);
295 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
296 function
->addBasicBlock(block
);
297 remaining_blocks
.push(block
);
299 LOG4CXX_DEBUG(logger
, "Reusing Block starting at " << std::hex
300 << current_block
->getStartAddress());
301 function
->addBasicBlock(new_blocks
.find(jmptarget
)->second
);
// Close the block on a zero-size decode, a terminator or any branch.
312 if (inst_size
== 0 || MIA
->isTerminator(inst
) || MIA
->isBranch(inst
)) {
313 current_block
->setEndAddress(current_address
+ base_address
+ inst_size
);
314 LOG4CXX_DEBUG(logger
, "Finished Block at " << std::hex
<<
315 current_block
->getEndAddress());
318 current_address
+= inst_size
;
321 splitBlocks(function
);
322 LOG4CXX_DEBUG(logger
, "Finished function " << function
->getName());
323 manager
->finishFunction(function
);
// Call targets gathered above are disassembled after this function is finished.
324 for (uint64_t address
: called_functions
)
325 disassembleFunctionAt(address
);
// Top-level disassembly driver.
//
// 1. Walks `symbols`, testing each against the text section
//    (containsSymbol) and against the symbol type ST_Function; for symbols
//    whose address can be read, a Function is created through
//    manager->newFunction(), named after the symbol and queued in
//    `remaining_functions`. A debug message is logged for addresses that are
//    already known to the manager.
// 2. Runs disassembleFunction() and manager->finishFunction() on every
//    queued function.
// 3. For ELF binaries, additionally disassembles the entry point under the
//    name "<_start 0x<hex address>>".
// 4. If the manager still has no functions, starts at the text section's
//    base address.
// NOTE(review): the statements following the two filter conditions and the
// condition selecting the "already disassembled" log are not visible in this
// excerpt -- presumably `continue` and a getFunction() check; confirm.
328 template <typename ELFT
>
329 void LLVMDisassembler
<ELFT
>::disassemble() {
330 SectionRef text_section
= getTextSection();
331 std::vector
<Function
*> remaining_functions
;
333 // Assume all function symbols actually start a real function
334 for (auto x
= symbols
.begin(); x
!= symbols
.end(); ++x
) {
337 SymbolRef::Type symbol_type
;
// True when the containment query reports an error or the symbol is not in
// the text section.
340 if (text_section
.containsSymbol(x
->second
, contains
) || !contains
)
// True when the type query reports an error or the symbol is not a function.
343 if (x
->second
.getType(symbol_type
)
344 || SymbolRef::ST_Function
!= symbol_type
)
347 if (!x
->second
.getAddress(result
)) {
348 Function
* fun
= manager
->newFunction(result
);
350 fun
->setName(x
->first
);
351 remaining_functions
.push_back(fun
);
352 LOG4CXX_DEBUG(logger
, "Disasembling " << x
->first
);
354 LOG4CXX_DEBUG(logger
, "Function at " << std::hex
<< result
355 << " already disassembled as " << manager
->getFunction(result
)->getName());
360 for (Function
* function
: remaining_functions
) {
361 disassembleFunction(function
);
362 manager
->finishFunction(function
);
// ELF only: make sure the entry point is disassembled as well.
365 if (binary
->isELF()) {
366 uint64_t _entryAddress
= entryAddress();
367 LOG4CXX_DEBUG(logger
, "Adding entryAddress at: " << std::hex
<< _entryAddress
);
369 s
<< "<_start 0x" << std::hex
<< _entryAddress
<< ">";
371 disassembleFunctionAt(_entryAddress
, s
.str());
// Fallback when nothing was found: begin at the start of the text section.
374 if (!manager
->hasFunctions()) {
376 text_section
.getAddress(text_entry
);
377 LOG4CXX_INFO(logger
, "No Symbols found, starting at the beginning of the text segment");
378 disassembleFunctionAt(text_entry
);
// COFF variant of entryAddress().
// Queries the PE32+ header first and returns its AddressOfEntryPoint when the
// header pointer is non-null; otherwise queries the PE32 header and returns
// that header's AddressOfEntryPoint.
// NOTE(review): no null check is visible on the dyn_cast result or on
// `pe32_header`, and both header pointers are declared without an
// initialiser -- confirm the getters always assign them.
383 uint64_t LLVMDisassembler
<COFFT
>::entryAddress() {
384 const auto coffobject
= dyn_cast
<COFFObjectFile
>(o
);
385 const struct pe32_header
* pe32_header
;
386 const struct pe32plus_header
* pe32plus_header
;
388 coffobject
->getPE32PlusHeader(pe32plus_header
);
390 if (pe32plus_header
) {
391 return pe32plus_header
->AddressOfEntryPoint
;
393 coffobject
->getPE32Header(pe32_header
);
394 return pe32_header
->AddressOfEntryPoint
;
// Mach-O variant of entryAddress(). Its body is not visible in this excerpt.
399 uint64_t LLVMDisassembler
<MACHOT
>::entryAddress() {
// Generic (ELF) variant of entryAddress().
// Casts `o` to ELFObjectFile<ELFT>, fetches the ELF file header and returns
// its e_entry field.
// NOTE(review): the dyn_cast result is dereferenced without a visible null
// check.
404 template <typename ELFT
>
405 uint64_t LLVMDisassembler
<ELFT
>::entryAddress() {
406 const auto elffile
= dyn_cast
<ELFObjectFile
<ELFT
>>(o
)->getELFFile();
407 const auto * header
= elffile
->getHeader();
409 return header
->e_entry
;
// Splits basic blocks of `function` that contain the start of another block.
//
// For every block of the function, an error is logged if its end address is
// still 0. Otherwise the block's instructions are decoded again from start to
// end; at each instruction the manager is asked (getBasicBlock) whether
// another block is registered. When the computed end address
// (offset + instruction size + section base) differs from the current block's
// end, the current block is shortened to that address, the other block is
// added to the function, next-block 0 is pointed at the other block and
// next-block 1 is cleared.
//
// @param function  function whose blocks are examined and possibly shortened
// NOTE(review): the check on `other`, the remaining arguments of the
// getBasicBlock() call and the loop's increment are not visible in this
// excerpt -- confirm.
412 template <typename ELFT
>
413 void LLVMDisassembler
<ELFT
>::splitBlocks(Function
* function
) {
414 SectionRef text_section
= getTextSection();
416 text_section
.getContents(bytes
);
417 StringRefMemoryObject
ref(bytes
);
419 LOG4CXX_DEBUG(logger
, "Splitting Blocks in Function " << function
->getName());
420 // Split blocks where jumps are going inside the block
421 for (auto it
= function
->blocks().begin();
422 it
!= function
->blocks().end();
424 BasicBlock
* current_block
= it
->second
;
// An end address of 0 marks a block that was never closed.
425 if (current_block
->getEndAddress() == 0) {
426 LOG4CXX_ERROR(logger
, "UNFINISHED BLOCK " << std::hex
<< current_block
->getStartAddress());
430 uint64_t base_address
;
431 text_section
.getAddress(base_address
);
// Walk the block using offsets relative to the text section.
432 uint64_t current_address
= current_block
->getStartAddress() - base_address
;
433 while(current_block
->getEndAddress() - base_address
> current_address
) {
436 llvm::raw_string_ostream
s(buf
);
438 if(llvm::MCDisassembler::Success
==
439 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
440 // See if some other block starts here
441 BasicBlock
* other
= manager
->getBasicBlock(current_address
445 // Special case, other block starts here but we are at the end anyway
447 uint64_t endaddress
= current_address
+ inst_size
+ base_address
;
448 if (endaddress
!= current_block
->getEndAddress()) {
449 LOG4CXX_DEBUG(logger
, "Shortening block starting at "
451 << current_block
->getStartAddress()
453 << other
->getStartAddress());
454 function
->addBasicBlock(other
);
455 current_block
->setEndAddress(endaddress
);
// After the split the current block falls straight through into `other`.
456 current_block
->setNextBlock(0, other
->getStartAddress());
457 current_block
->setNextBlock(1, 0);
463 current_address
+= inst_size
;
// COFF variant of readDynamicSymbols(). Its body is not visible in this excerpt.
469 void LLVMDisassembler
<COFFT
>::readDynamicSymbols() {
// Mach-O variant of readDynamicSymbols(). Its body is not visible in this excerpt.
474 void LLVMDisassembler
<MACHOT
>::readDynamicSymbols() {
// Generic (ELF) variant of readDynamicSymbols().
//
// Iterates over the ELF file's dynamic symbols. For every symbol whose type
// equals 2 (a function, per the inline comment) it reads the symbol name and
// version, registers a dynamic function at address 0 through
// manager->newDynamicFunction(0), names it "<name>@@<version>" when
// `is_default` is set and "<name>@<version>" otherwise, and finishes it via
// manager->finishFunction(). A debug message is logged per symbol.
// The results of getSymbolName()/getSymbolVersion() are dereferenced without
// an error check (see the TODO below).
478 template <typename ELFT
>
479 void LLVMDisassembler
<ELFT
>::readDynamicSymbols() {
480 const auto elffile
= dyn_cast
<ELFObjectFile
<ELFT
>>(o
)->getELFFile();
481 for (auto it
= elffile
->begin_dynamic_symbols(),
482 end
= elffile
->end_dynamic_symbols();
485 if (it
->getType() == 2) { // Function
487 // TODO: Error handling
488 std::string symbolname
= *(elffile
->getSymbolName(it
));
489 std::string symbolversion
= *(elffile
->getSymbolVersion(nullptr, &*it
, is_default
));
490 // TODO: actually get the symbol address from relocations
491 Function
* f
= manager
->newDynamicFunction(0);
492 f
->setName(symbolname
+ (is_default
? "@@" : "@") + symbolversion
);
493 manager
->finishFunction(f
);
495 LOG4CXX_DEBUG(logger
, "Adding dynamic Symbol " << symbolname
<< (is_default
? "@@" : "@") << symbolversion
);
// Fills the `symbols` map from the object file's symbol table.
// Iterates from o->symbol_begin() to o->symbol_end(); for each symbol the
// name is read with getName(). When that reports an error the message is
// logged; otherwise the symbol is inserted into `symbols` keyed by its name
// and a debug message is logged.
// NOTE(review): what follows the error log (presumably skipping the symbol)
// is not visible in this excerpt -- confirm.
500 template <typename ELFT
>
501 void LLVMDisassembler
<ELFT
>::readSymbols() {
503 symbol_iterator
si(o
->symbol_begin()), se(o
->symbol_end());
504 for (; si
!= se
; ++si
) {
506 if ((ec
= si
->getName(name
))) {
507 LOG4CXX_ERROR(logger
, ec
.message());
510 LOG4CXX_DEBUG(logger
, "Added symbol " << name
.str());
511 symbols
.insert(make_pair(name
.str(), *si
));
// Fills the `sections` map from the object file's section table.
// Iterates from o->section_begin() to o->section_end(); for each section the
// name is read with getName(). When that reports an error the message is
// logged; otherwise the section is inserted into `sections` keyed by its name
// and a debug message is logged.
// NOTE(review): what follows the error log (presumably skipping the section)
// is not visible in this excerpt -- confirm.
515 template <typename ELFT
>
516 void LLVMDisassembler
<ELFT
>::readSections() {
518 section_iterator
i(o
->section_begin()), e(o
->section_end());
519 for (; i
!= e
; ++i
) {
521 if ((ec
= i
->getName(name
))) {
522 LOG4CXX_ERROR(logger
, ec
.message());
525 LOG4CXX_DEBUG(logger
, "Added section " << name
.str());
526 sections
.insert(make_pair(name
.str(), *i
));
531 // template <typename ELFT>
532 // void LLVMDisassembler<ELFT>::forEachFunction(std::function<void (uint64_t, Function*)> callback) {
533 // // std::for_each(functions.begin(), functions.end(),
534 // // [&](std::pair<uint64_t, Function*> x) {
535 // // callback(x.first, x.second);
// Decodes all instructions of `block` and returns them as Instruction objects.
//
// Instructions are decoded from the text section between the block's start
// and end address (both converted to offsets relative to the section base).
// For each successfully decoded instruction the raw bytes are read, the
// textual form is produced with IP->printInst(), and -- when
// MIA->evaluateBranch() yields a target -- a reference string is built from
// an optional "function:" prefix (for calls) and the hexadecimal absolute
// target address. On decode failure a warning is logged and an
// "Invalid Instruction" entry holding a single byte is appended instead.
//
// @param block  basic block whose address range is decoded
// @return vector of Instruction objects in address order
// NOTE(review): `uint8_t bytes[inst_size+2]` is a variable-length array,
// which is a compiler extension rather than standard C++.
// NOTE(review): the "Invalid byte at" message has no space before the
// address that is streamed after it.
539 template <typename ELFT
>
540 std::vector
<Instruction
> LLVMDisassembler
<ELFT
>::getInstructions(const BasicBlock
*block
) {
541 std::vector
<Instruction
> result
;
542 SectionRef text_section
= getTextSection();
543 uint64_t base_address
;
544 text_section
.getAddress(base_address
);
// Both bounds are offsets relative to the start of the text section.
545 uint64_t current_address
= block
->getStartAddress() - base_address
;
546 uint64_t end_position
= block
->getEndAddress() - base_address
;
549 text_section
.getContents(bytes
);
550 StringRefMemoryObject
ref(bytes
);
552 while (current_address
< end_position
) {
556 llvm::raw_string_ostream
s(buf
);
558 if(llvm::MCDisassembler::Success
==
559 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
561 uint8_t bytes
[inst_size
+2];
562 ref
.readBytes(current_address
, inst_size
, bytes
);
566 IP
->printInst(&inst
, s
, "");
567 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
568 std::stringstream stream
;
569 if (MIA
->isCall(inst
))
570 stream
<< "function:";
574 stream
<< std::hex
<< (base_address
+ jmptarget
);
// The stored address is absolute again (offset + section base); the printed
// text is trimmed of surrounding whitespace.
577 result
.push_back(Instruction(current_address
+ base_address
, boost::algorithm::trim_copy(s
.str()),
578 std::vector
<uint8_t>(bytes
, bytes
+inst_size
), ref
));
580 LOG4CXX_WARN(logger
, "Invalid byte at" << std::hex
<< current_address
+ base_address
);
582 ref
.readBytes(current_address
, 1, bytes
);
583 result
.push_back(Instruction(current_address
+ base_address
, "Invalid Instruction",
584 std::vector
<uint8_t>(bytes
, bytes
+1), ""));
588 current_address
+= inst_size
;
// Decodes the instructions in the address range [start, end) and reports
// each one to the callback `fun`.
//
// Addresses are converted to offsets relative to the text section's base.
// For each successfully decoded instruction the raw bytes are read, the
// textual form is produced with IP->printInst(), and -- when
// MIA->evaluateBranch() yields a target -- a reference string is built from
// an optional "function:" prefix (for calls) and the hexadecimal absolute
// target address. The callback receives the byte buffer, the instruction
// size, the printed text and the reference. On decode failure a warning is
// logged and the callback is invoked as fun(NULL, 0, "Invalid Byte", "").
//
// @param start  absolute address of the first instruction
// @param end    absolute address at which decoding stops (exclusive)
// @param fun    callback invoked once per decoded (or invalid) instruction
// NOTE(review): `uint8_t bytes[inst_size+2]` is a variable-length array,
// which is a compiler extension rather than standard C++.
// NOTE(review): the "Invalid byte at" message has no space before the
// address that is streamed after it.
593 template <typename ELFT
>
594 void LLVMDisassembler
<ELFT
>::printEachInstruction(uint64_t start
, uint64_t end
,
595 std::function
<void (uint8_t*, size_t,
597 const std::string
&)> fun
) {
598 SectionRef text_section
= getTextSection();
599 uint64_t base_address
;
600 text_section
.getAddress(base_address
);
// Work with an offset relative to the start of the text section.
601 uint64_t current_address
= start
- base_address
;
604 text_section
.getContents(bytes
);
605 StringRefMemoryObject
ref(bytes
);
607 while (current_address
< end
- base_address
) {
611 llvm::raw_string_ostream
s(buf
);
613 if(llvm::MCDisassembler::Success
==
614 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
616 uint8_t bytes
[inst_size
+2];
617 ref
.readBytes(current_address
, inst_size
, bytes
);
621 IP
->printInst(&inst
, s
, "");
622 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
623 std::stringstream stream
;
624 if (MIA
->isCall(inst
))
625 stream
<< "function:";
629 stream
<< std::hex
<< (base_address
+ jmptarget
);
634 fun(bytes
, inst_size
, s
.str(), ref
);
636 LOG4CXX_WARN(logger
, "Invalid byte at" << std::hex
<< current_address
+ base_address
);
637 fun(NULL
, 0, "Invalid Byte", "");
641 current_address
+= inst_size
;
// Generic variant of getTextSection(): returns the entry of the `sections`
// map stored under the key ".text".
645 template <typename ELFT
>
646 SectionRef LLVMDisassembler
<ELFT
>::getTextSection() {
647 return sections
[".text"];
// Mach-O variant of getTextSection(): returns the entry of the `sections`
// map stored under the key "__text".
651 SectionRef LLVMDisassembler
<MACHOT
>::getTextSection() {
652 return sections
["__text"];