1 #include "disassembler/llvm/LLVMDisassembler.hxx"
2 #include "core/InformationManager.hxx"
3 #include "core/Function.hxx"
4 #include "core/BasicBlock.hxx"
11 using namespace llvm::object
;
12 using std::error_code
;
// Factory function: opens `filename` with createBinary() and returns a newly
// allocated LLVMDisassembler instantiated for the first object-file type that
// dyn_cast recognises, tried in this order: ELF32 LE, ELF64 LE, ELF32 BE,
// ELF64 BE, COFF, MachO.
//
// filename - path handed to createBinary() and forwarded to the disassembler.
// manager  - forwarded unchanged to the LLVMDisassembler constructor.
27 Disassembler
* createLLVMDisassembler(const std::string
& filename
, InformationManager
* manager
) {
// The Binary pointer is stored in a unique_ptr and immediately released again,
// so `op` is a plain raw pointer from here on.
// NOTE(review): no check of createBinary()'s result for an error is visible
// before .get() is called on it - confirm against the full file.
31 std::unique_ptr
<Binary
> o
;
32 o
.reset(createBinary(filename
).get());
33 Binary
* op
= o
.release();
35 // ELFType<endian, maxalign, 64bit>
// 32-bit little-endian ELF.
36 if (ELF32LEObjectFile
* object
= dyn_cast
<ELF32LEObjectFile
>(op
)) {
37 return new LLVMDisassembler
<ELFType
<support::little
, 2, false>>(filename
, manager
, object
);
// 64-bit little-endian ELF.
39 if (ELF64LEObjectFile
* object
= dyn_cast
<ELF64LEObjectFile
>(op
)) {
40 return new LLVMDisassembler
<ELFType
<support::little
, 2, true>>(filename
, manager
, object
);
// 32-bit big-endian ELF.
42 if (ELF32BEObjectFile
* object
= dyn_cast
<ELF32BEObjectFile
>(op
)) {
43 return new LLVMDisassembler
<ELFType
<support::big
, 2, false>>(filename
, manager
, object
);
// 64-bit big-endian ELF.
45 if (ELF64BEObjectFile
* object
= dyn_cast
<ELF64BEObjectFile
>(op
)) {
46 return new LLVMDisassembler
<ELFType
<support::big
, 2, true>>(filename
, manager
, object
);
// COFF object file.
48 if (COFFObjectFile
* object
= dyn_cast
<COFFObjectFile
>(op
)) {
49 return new LLVMDisassembler
<COFFT
>(filename
, manager
, object
);
// MachO object file.
51 if (MachOObjectFile
* object
= dyn_cast
<MachOObjectFile
>(op
)) {
52 return new LLVMDisassembler
<MACHOT
>(filename
, manager
, object
);
59 * TODO: fallback code in case the file is not an ELF/PE/COFF/MachO/... binary
60 * but, e.g., just raw instructions or a boot sector or
// Constructor: loads the binary named by `filename`, derives a target triple
// from the object file's architecture, looks up the matching LLVM target and
// then creates the MC helper objects one after another (register info, asm
// info, subtarget info, instruction info, disassembler, instruction analysis,
// instruction printer). Each creation step is followed by an error log
// message for the failure case. Finally an MCObjectDisassembler builds the
// module stored in `Mod`.
63 template <typename ELFT
>
64 LLVMDisassembler
<ELFT
>::LLVMDisassembler(const std::string
& filename
,
65 InformationManager
* manager
,
68 , logger(log4cxx::Logger::getLogger("disassembler.LLVMDisassembler"))
69 , triple("unknown-unknown-unknown")
72 LOG4CXX_DEBUG(logger
, "Handling file " << filename
);
// Open the file; a load error is logged with its message.
75 auto result
= createBinary(filename
);
78 if ((ec
= result
.getError())) {
79 LOG4CXX_ERROR(logger
, "Failed to load Binary" << ec
.message());
// Keep the binary in `binary` and view it as an ObjectFile through `o`.
84 binary
.reset(result
.get());
86 o
= dyn_cast
<ObjectFile
>(binary
.get());
// Only the architecture component of the triple is filled in from the file;
// it starts out as "unknown-unknown-unknown".
92 triple
.setArch(Triple::ArchType(o
->getArch()));
93 std::string
tripleName(triple
.getTriple());
95 LOG4CXX_INFO(logger
, "Architecture " << tripleName
);
// Target lookup; `es` receives the error text that is logged below.
99 target
= TargetRegistry::lookupTarget("", triple
, es
);
101 LOG4CXX_ERROR(logger
, es
);
105 LOG4CXX_INFO(logger
, "Target " << target
->getName());
// Register info.
107 MRI
.reset(target
->createMCRegInfo(tripleName
));
109 LOG4CXX_ERROR(logger
, "no register info for target " << tripleName
);
113 // Set up disassembler.
114 AsmInfo
.reset(target
->createMCAsmInfo(*MRI
, tripleName
));
116 LOG4CXX_ERROR(logger
, "no assembly info for target " << tripleName
);
// Subtarget info with empty CPU and feature strings.
120 STI
.reset(target
->createMCSubtargetInfo(tripleName
, "", ""));
122 LOG4CXX_ERROR(logger
, "no subtarget info for target " << tripleName
);
// Instruction info.
126 MII
.reset(target
->createMCInstrInfo());
128 LOG4CXX_ERROR(logger
, "no instruction info for target " << tripleName
);
// The MC context is a local object built from the members created above.
132 MOFI
.reset(new MCObjectFileInfo
);
133 MCContext
Ctx(AsmInfo
.get(), MRI
.get(), MOFI
.get());
135 DisAsm
.reset(target
->createMCDisassembler(*STI
, Ctx
));
137 LOG4CXX_ERROR(logger
, "no disassembler for target " << tripleName
);
141 target
->createMCRelocationInfo(tripleName
, Ctx
));
144 // MCObjectSymbolizer::createObjectSymbolizer(Ctx, std::move(RelInfo), o));
146 // DisAsm->setSymbolizer(std::move(Symzer));
// Instruction analysis (used elsewhere in this file for branch evaluation).
151 MIA
.reset(target
->createMCInstrAnalysis(MII
.get()));
153 LOG4CXX_ERROR(logger
, "no instruction analysis for target " << tripleName
);
// Instruction printer using the assembler dialect reported by AsmInfo.
157 int AsmPrinterVariant
= AsmInfo
->getAssemblerDialect();
158 IP
.reset(target
->createMCInstPrinter(AsmPrinterVariant
, *AsmInfo
, *MII
, *MRI
, *STI
));
160 LOG4CXX_ERROR(logger
, "no instruction printer for target " << tripleName
);
// NOTE(review): setPrintImmHex is called twice in a row, first with a
// llvm::HexStyle value and then with `true`. The first call was presumably
// meant to select the hex style via a different setter - confirm against the
// MCInstPrinter API of the LLVM version in use.
164 IP
->setPrintImmHex(llvm::HexStyle::C
);
165 IP
->setPrintImmHex(true);
// Build the module from the object file; the result is kept in `Mod`.
167 std::unique_ptr
<MCObjectDisassembler
> OD(
168 new MCObjectDisassembler(*o
, *DisAsm
, *MIA
));
169 Mod
.reset(OD
->buildModule(false));
// Starts processing of the loaded binary. The only call visible in this view
// is readDynamicSymbols(); other steps may exist on lines not shown here.
174 template <typename ELFT
>
175 void LLVMDisassembler
<ELFT
>::start() {
178 readDynamicSymbols();
// Destructor with an empty body: no explicit cleanup is performed here.
181 template <typename ELFT
>
182 LLVMDisassembler
<ELFT
>::~LLVMDisassembler() {}
// Disassembles the function starting at `address`, creating it first if the
// manager does not know a function at that address yet.
//
// address - absolute start address; compared against the text section range.
// name    - name given to a newly created function. When the generated
//           "<Unnamed 0x...>" name is used instead is decided on a line not
//           visible here (presumably when `name` is empty - confirm).
184 template <typename ELFT
>
185 Function
* LLVMDisassembler
<ELFT
>::disassembleFunctionAt(uint64_t address
, const std::string
& name
) {
187 SectionRef text_section
= getTextSection();
188 uint64_t base_address
, size
;
189 text_section
.getAddress(base_address
);
190 text_section
.getSize(size
);
// Range check: is `address` outside [base_address, base_address + size)?
192 if (address
< base_address
||
193 address
>= base_address
+ size
) {
// Only create a function if the manager has none at this address.
197 if (NULL
== (function
= manager
->getFunction(address
))) {
// Variant 1: generated placeholder name containing the hex address.
201 s
<< "<Unnamed 0x" << std::hex
<< address
<< ">";
202 function
= manager
->newFunction(address
);
203 function
->setName(s
.str());
// Variant 2: use the caller-supplied name.
205 function
= manager
->newFunction(address
);
206 function
->setName(name
);
208 disassembleFunction(function
);
// Disassembles one function into basic blocks using a worklist.
//
// Starting with a block at the function's start address, instructions are
// decoded one by one. Direct branch targets (and, for conditional branches,
// the fall-through address) become new blocks that are pushed on the
// worklist; direct call targets unknown to the manager are collected and
// disassembled as functions at the end. A block ends at an instruction that
// has size 0, is a terminator, or is a branch. Afterwards splitBlocks() is
// run and the function is reported to the manager via finishFunction().
214 template <typename ELFT
>
215 void LLVMDisassembler
<ELFT
>::disassembleFunction(Function
* function
) {
// Call targets found while decoding that the manager has no function for.
216 std::vector
<uint64_t> called_functions
;
// Worklist of blocks that still have to be decoded.
217 std::stack
<BasicBlock
*> remaining_blocks
;
219 * Do all blocks get added properly? We should take care to remove
220 * the other ones at the end of the function!
// Blocks created during this call, keyed by start address.
222 std::map
<uint64_t, BasicBlock
*> new_blocks
;
223 SectionRef text_section
= getTextSection();
225 text_section
.getContents(bytes
);
226 StringRefMemoryObject
ref(bytes
);
228 LOG4CXX_DEBUG(logger
, "Handling function " << function
->getName());
// Seed the worklist with the block at the function's start address.
230 BasicBlock
* block
= manager
->newBasicBlock(function
->getStartAddress());
231 remaining_blocks
.push(block
);
232 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
233 function
->addBasicBlock(block
);
235 uint64_t base_address
, size
;
236 text_section
.getAddress(base_address
);
237 text_section
.getSize(size
);
238 LOG4CXX_DEBUG(logger
, "Text section at " << std::hex
<< base_address
<< " with size " << size
);
240 while (remaining_blocks
.size()) {
241 BasicBlock
* current_block
= remaining_blocks
.top();
242 remaining_blocks
.pop();
244 LOG4CXX_DEBUG(logger
, "Handling Block starting at " << std::hex
245 << current_block
->getStartAddress());
// current_address is an offset relative to the start of the text section.
248 uint64_t current_address
= current_block
->getStartAddress() - base_address
;
252 llvm::raw_string_ostream
s(buf
);
254 if(llvm::MCDisassembler::Success
==
255 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
// The branch target is evaluated from the section-relative address, so
// base_address is added to obtain an absolute address.
258 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
259 jmptarget
+= base_address
;
260 if (!MIA
->isIndirectBranch(inst
)) {
// Direct call: remember the target unless a function already exists there.
261 if (MIA
->isCall(inst
)) {
262 if (NULL
== manager
->getFunction(jmptarget
))
263 called_functions
.push_back(jmptarget
);
// Direct jump: successor slot 0 is the jump target.
265 current_block
->setNextBlock(0, jmptarget
);
266 if (new_blocks
.find(jmptarget
) == new_blocks
.end()) {
267 BasicBlock
* block
= manager
->newBasicBlock(jmptarget
);
269 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
270 function
->addBasicBlock(block
);
271 remaining_blocks
.push(block
);
// NOTE(review): this message prints the start address of current_block, not
// of the block at jmptarget that is being reused - confirm which is intended.
273 LOG4CXX_DEBUG(logger
, "Reusing Block starting at " << std::hex
274 << current_block
->getStartAddress());
275 function
->addBasicBlock(new_blocks
.find(jmptarget
)->second
);
// Conditional branch: successor slot 1 is the fall-through address, i.e. the
// absolute address directly after this instruction.
277 if (MIA
->isConditionalBranch(inst
)) {
278 jmptarget
= base_address
+ current_address
+ inst_size
;
279 current_block
->setNextBlock(1, jmptarget
);
280 if (new_blocks
.find(jmptarget
) == new_blocks
.end()) {
281 BasicBlock
* block
= manager
->newBasicBlock(jmptarget
);
283 new_blocks
.insert(std::make_pair(block
->getStartAddress(), block
));
284 function
->addBasicBlock(block
);
285 remaining_blocks
.push(block
);
// NOTE(review): same as above - the logged address is current_block's.
287 LOG4CXX_DEBUG(logger
, "Reusing Block starting at " << std::hex
288 << current_block
->getStartAddress());
289 function
->addBasicBlock(new_blocks
.find(jmptarget
)->second
);
// End of block: record the absolute end address (after this instruction).
300 if (inst_size
== 0 || MIA
->isTerminator(inst
) || MIA
->isBranch(inst
)) {
301 current_block
->setEndAddress(current_address
+ base_address
+ inst_size
);
302 LOG4CXX_DEBUG(logger
, "Finished Block at " << std::hex
<<
303 current_block
->getEndAddress());
306 current_address
+= inst_size
;
// Post-processing: split blocks, report the function, then disassemble the
// call targets collected above.
309 splitBlocks(function
);
310 LOG4CXX_DEBUG(logger
, "Finished function " << function
->getName());
311 manager
->finishFunction(function
);
312 for (uint64_t address
: called_functions
)
313 disassembleFunctionAt(address
);
// Top-level disassembly driver.
//
// 1. Creates a Function for every entry in `symbols` that lies in the text
//    section, has type ST_Function and yields an address, then disassembles
//    each of them.
// 2. For ELF binaries, additionally disassembles from the entry address
//    under a generated "<_start 0x...>" name.
// 3. If the manager still has no functions, starts at the text section's
//    start address.
316 template <typename ELFT
>
317 void LLVMDisassembler
<ELFT
>::disassemble() {
318 SectionRef text_section
= getTextSection();
319 std::vector
<Function
*> remaining_functions
;
321 // Assume all function symbols actually start a real function
322 for (auto x
= symbols
.begin(); x
!= symbols
.end(); ++x
) {
325 SymbolRef::Type symbol_type
;
// Filter: containsSymbol() failed or the symbol is not in the text section.
328 if (text_section
.containsSymbol(x
->second
, contains
) || !contains
)
// Filter: getType() failed or the symbol is not a function symbol.
331 if (x
->second
.getType(symbol_type
)
332 || SymbolRef::ST_Function
!= symbol_type
)
// getAddress() returning a false value means success here.
335 if (!x
->second
.getAddress(result
)) {
336 Function
* fun
= manager
->newFunction(result
);
338 fun
->setName(x
->first
);
339 remaining_functions
.push_back(fun
);
340 LOG4CXX_DEBUG(logger
, "Disasembling " << x
->first
);
342 LOG4CXX_DEBUG(logger
, "Function at " << std::hex
<< result
343 << " already disassembled as " << manager
->getFunction(result
)->getName());
// NOTE(review): disassembleFunction() itself already calls
// manager->finishFunction(function); the second call below looks redundant -
// confirm whether finishFunction may safely be called twice.
348 for (Function
* function
: remaining_functions
) {
349 disassembleFunction(function
);
350 manager
->finishFunction(function
);
// ELF only: also disassemble from the entry address.
353 if (binary
->isELF()) {
354 uint64_t _entryAddress
= entryAddress();
355 LOG4CXX_DEBUG(logger
, "Adding entryAddress at: " << std::hex
<< _entryAddress
);
357 s
<< "<_start 0x" << std::hex
<< _entryAddress
<< ">";
359 disassembleFunctionAt(_entryAddress
, s
.str());
// Fallback when nothing was found: begin at the start of the text section.
362 if (!manager
->hasFunctions()) {
364 text_section
.getAddress(text_entry
);
365 LOG4CXX_INFO(logger
, "No Symbols found, starting at the beginning of the text segment");
366 disassembleFunctionAt(text_entry
);
// COFF variant: returns AddressOfEntryPoint from the PE32+ header when
// getPE32PlusHeader() yields a non-null header, otherwise from the PE32
// header.
371 uint64_t LLVMDisassembler
<COFFT
>::entryAddress() {
372 const auto coffobject
= dyn_cast
<COFFObjectFile
>(o
);
373 const struct pe32_header
* pe32_header
;
374 const struct pe32plus_header
* pe32plus_header
;
376 coffobject
->getPE32PlusHeader(pe32plus_header
);
378 if (pe32plus_header
) {
379 return pe32plus_header
->AddressOfEntryPoint
;
// NOTE(review): pe32_header is dereferenced without a visible null check, and
// the return values of both header getters are ignored - confirm.
381 coffobject
->getPE32Header(pe32_header
);
382 return pe32_header
->AddressOfEntryPoint
;
// MachO variant of entryAddress(). The body is not visible in this view.
387 uint64_t LLVMDisassembler
<MACHOT
>::entryAddress() {
// Generic (ELF) variant: returns the e_entry field of the ELF file header.
// NOTE(review): the result of dyn_cast is dereferenced directly; this relies
// on `o` really being an ELFObjectFile<ELFT> - confirm.
392 template <typename ELFT
>
393 uint64_t LLVMDisassembler
<ELFT
>::entryAddress() {
394 const auto elffile
= dyn_cast
<ELFObjectFile
<ELFT
>>(o
)->getELFFile();
395 const auto * header
= elffile
->getHeader();
397 return header
->e_entry
;
// Splits basic blocks of `function` that contain the start of another block.
//
// Every block of the function is re-decoded instruction by instruction. When
// another basic block is known to start inside the current one, the current
// block is shortened to end there, its successor 0 is set to the other block
// and successor 1 is cleared. Blocks whose end address is still 0 are
// reported as unfinished.
400 template <typename ELFT
>
401 void LLVMDisassembler
<ELFT
>::splitBlocks(Function
* function
) {
402 SectionRef text_section
= getTextSection();
404 text_section
.getContents(bytes
);
405 StringRefMemoryObject
ref(bytes
);
407 LOG4CXX_DEBUG(logger
, "Splitting Blocks in Function " << function
->getName());
408 // Split blocks where jumps are going inside the block
409 for (auto it
= function
->blocks().begin();
410 it
!= function
->blocks().end();
412 BasicBlock
* current_block
= it
->second
;
// An end address of 0 marks a block that was never finished.
413 if (current_block
->getEndAddress() == 0) {
414 LOG4CXX_ERROR(logger
, "UNFINISHED BLOCK " << std::hex
<< current_block
->getStartAddress());
418 uint64_t base_address
;
419 text_section
.getAddress(base_address
);
// Walk the block using offsets relative to the text section start.
420 uint64_t current_address
= current_block
->getStartAddress() - base_address
;
421 while(current_block
->getEndAddress() - base_address
> current_address
) {
424 llvm::raw_string_ostream
s(buf
);
426 if(llvm::MCDisassembler::Success
==
427 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
428 // See if some other block starts here
// NOTE(review): the lookup argument begins with the section-relative
// current_address; the rest of the expression is not visible - confirm that
// an absolute address is passed.
429 BasicBlock
* other
= manager
->getBasicBlock(current_address
433 // Special case, other block starts here but we are at the end anyway
435 uint64_t endaddress
= current_address
+ inst_size
+ base_address
;
436 if (endaddress
!= current_block
->getEndAddress()) {
437 LOG4CXX_DEBUG(logger
, "Shortening block starting at "
439 << current_block
->getStartAddress()
441 << other
->getStartAddress());
// Cut the current block at `endaddress` and chain it to `other`.
442 function
->addBasicBlock(other
);
443 current_block
->setEndAddress(endaddress
);
444 current_block
->setNextBlock(0, other
->getStartAddress());
445 current_block
->setNextBlock(1, 0);
451 current_address
+= inst_size
;
// COFF variant of readDynamicSymbols(). The body is not visible in this view.
457 void LLVMDisassembler
<COFFT
>::readDynamicSymbols() {
// MachO variant of readDynamicSymbols(). The body is not visible in this view.
462 void LLVMDisassembler
<MACHOT
>::readDynamicSymbols() {
// Generic (ELF) variant: iterates over the ELF file's dynamic symbols and
// registers every function symbol with the manager as a dynamic function
// named "<symbol>@<version>" or "<symbol>@@<version>" (the latter when
// getSymbolVersion() reports the version as default).
466 template <typename ELFT
>
467 void LLVMDisassembler
<ELFT
>::readDynamicSymbols() {
468 const auto elffile
= dyn_cast
<ELFObjectFile
<ELFT
>>(o
)->getELFFile();
469 for (auto it
= elffile
->begin_dynamic_symbols(),
470 end
= elffile
->end_dynamic_symbols();
473 if (it
->getType() == 2) { // Function
475 // TODO: Error handling
// Both results are dereferenced without checking for an error (see the TODO).
476 std::string symbolname
= *(elffile
->getSymbolName(it
));
477 std::string symbolversion
= *(elffile
->getSymbolVersion(nullptr, &*it
, is_default
));
478 // TODO: actually get the symbol address from relocations
// Address 0 is a placeholder until the TODO above is resolved.
479 Function
* f
= manager
->newDynamicFunction(0);
480 f
->setName(symbolname
+ (is_default
? "@@" : "@") + symbolversion
);
481 manager
->finishFunction(f
);
483 LOG4CXX_DEBUG(logger
, "Adding dynamic Symbol " << symbolname
<< (is_default
? "@@" : "@") << symbolversion
);
// Iterates over all symbols of the object file `o` and stores each one in
// `symbols` under its name. A failure to read a symbol's name is logged.
488 template <typename ELFT
>
489 void LLVMDisassembler
<ELFT
>::readSymbols() {
491 symbol_iterator
si(o
->symbol_begin()), se(o
->symbol_end());
492 for (; si
!= se
; ++si
) {
// A non-zero error code from getName() is logged with its message.
494 if ((ec
= si
->getName(name
))) {
495 LOG4CXX_ERROR(logger
, ec
.message());
498 LOG4CXX_DEBUG(logger
, "Added symbol " << name
.str());
499 symbols
.insert(make_pair(name
.str(), *si
));
// Iterates over all sections of the object file `o` and stores each one in
// `sections` under its name. A failure to read a section's name is logged.
503 template <typename ELFT
>
504 void LLVMDisassembler
<ELFT
>::readSections() {
506 section_iterator
i(o
->section_begin()), e(o
->section_end());
507 for (; i
!= e
; ++i
) {
// A non-zero error code from getName() is logged with its message.
509 if ((ec
= i
->getName(name
))) {
510 LOG4CXX_ERROR(logger
, ec
.message());
513 LOG4CXX_DEBUG(logger
, "Added section " << name
.str());
514 sections
.insert(make_pair(name
.str(), *i
));
519 // template <typename ELFT>
520 // void LLVMDisassembler<ELFT>::forEachFunction(std::function<void (uint64_t, Function*)> callback) {
521 // // std::for_each(functions.begin(), functions.end(),
522 // // [&](std::pair<uint64_t, Function*> x) {
523 // // callback(x.first, x.second);
// Decodes the instructions in the address range [start, end) of the text
// section and invokes `fun` once per decoded instruction, or once with
// "Invalid Byte" when decoding fails.
//
// start, end - absolute addresses; converted to section-relative offsets by
//              subtracting the text section's base address.
// fun        - callback receiving the raw instruction bytes, their count,
//              the printed instruction text and one further string.
527 template <typename ELFT
>
528 void LLVMDisassembler
<ELFT
>::printEachInstruction(uint64_t start
, uint64_t end
,
529 std::function
<void (uint8_t*, size_t,
531 const std::string
&)> fun
) {
532 SectionRef text_section
= getTextSection();
533 uint64_t base_address
;
534 text_section
.getAddress(base_address
);
535 uint64_t current_address
= start
- base_address
;
538 text_section
.getContents(bytes
);
539 StringRefMemoryObject
ref(bytes
);
541 while (current_address
< end
- base_address
) {
// `s` writes the printed instruction into `buf`.
545 llvm::raw_string_ostream
s(buf
);
547 if(llvm::MCDisassembler::Success
==
548 DisAsm
->getInstruction(inst
, inst_size
, ref
, current_address
, nulls(), nulls())) {
// NOTE(review): this array has a runtime size (not standard C++, relies on a
// compiler extension) and reuses the name `bytes` that was passed to
// getContents() above - confirm this is intended.
550 uint8_t bytes
[inst_size
+2];
551 ref
.readBytes(current_address
, inst_size
, bytes
);
555 IP
->printInst(&inst
, s
, "");
// For branches, a textual reference to the absolute target is built; calls
// get the prefix "function:".
556 if (MIA
->evaluateBranch(inst
, current_address
, inst_size
, jmptarget
)) {
557 std::stringstream stream
;
558 if (MIA
->isCall(inst
))
559 stream
<< "function:";
563 stream
<< std::hex
<< (base_address
+ jmptarget
);
568 fun(bytes
, inst_size
, s
.str(), ref
);
// Decoding failed at this offset.
// NOTE(review): the message has no separator between "at" and the address.
570 LOG4CXX_WARN(logger
, "Invalid byte at" << std::hex
<< current_address
+ base_address
);
571 fun(NULL
, 0, "Invalid Byte", "");
575 current_address
+= inst_size
;
// Generic variant: returns the section stored in `sections` under ".text".
// NOTE(review): if `sections` is a std::map-like container, operator[]
// inserts a default-constructed entry when the key is missing - confirm.
579 template <typename ELFT
>
580 SectionRef LLVMDisassembler
<ELFT
>::getTextSection() {
581 return sections
[".text"];
// MachO variant: returns the section stored in `sections` under "__text".
585 SectionRef LLVMDisassembler
<MACHOT
>::getTextSection() {
586 return sections
["__text"];