]> git.siccegge.de Git - frida/frida.git/blob - src/disassembler/llvm/LLVMDisassembler.cxx
Remove whitespace around printed instruction
[frida/frida.git] / src / disassembler / llvm / LLVMDisassembler.cxx
1 #include "disassembler/Instruction.hxx"
2 #include "disassembler/llvm/LLVMDisassembler.hxx"
3 #include "core/InformationManager.hxx"
4 #include "core/Function.hxx"
5 #include "core/BasicBlock.hxx"
6 #include <boost/algorithm/string.hpp>
7
8 #include <stack>
9 #include <algorithm>
10 #include <cassert>
11
12 using namespace llvm;
13 using namespace llvm::object;
14 using std::error_code;
15
namespace {
    // Empty tag type: LLVMDisassembler<COFFT> is the instantiation used for
    // COFF object files.
    class COFFT {};

    // Empty tag type: LLVMDisassembler<MACHOT> is the instantiation used for
    // Mach-O object files.
    class MACHOT {};
}
25
26 /*
27 *
28 */
29 Disassembler * createLLVMDisassembler(const std::string& filename, InformationManager* manager) {
30 if (filename == "")
31 return NULL;
32
33 std::unique_ptr<Binary> o;
34 o.reset(createBinary(filename).get());
35 Binary * op = o.release();
36
37 // ELFType<endian, maxalign, 64bit>
38 if (ELF32LEObjectFile * object = dyn_cast<ELF32LEObjectFile>(op)) {
39 return new LLVMDisassembler<ELFType<support::little, 2, false>>(filename, manager, object);
40 }
41 if (ELF64LEObjectFile * object = dyn_cast<ELF64LEObjectFile>(op)) {
42 return new LLVMDisassembler<ELFType<support::little, 2, true>>(filename, manager, object);
43 }
44 if (ELF32BEObjectFile * object = dyn_cast<ELF32BEObjectFile>(op)) {
45 return new LLVMDisassembler<ELFType<support::big, 2, false>>(filename, manager, object);
46 }
47 if (ELF64BEObjectFile * object = dyn_cast<ELF64BEObjectFile>(op)) {
48 return new LLVMDisassembler<ELFType<support::big, 2, true>>(filename, manager, object);
49 }
50 if (COFFObjectFile * object = dyn_cast<COFFObjectFile>(op)) {
51 return new LLVMDisassembler<COFFT>(filename, manager, object);
52 }
53 if (MachOObjectFile * object = dyn_cast<MachOObjectFile>(op)) {
54 return new LLVMDisassembler<MACHOT>(filename, manager, object);
55 }
56
57 return NULL;
58 }
59
60 /*
61 * TODO: fallback code falls die Datei kein ELF/PE/COFF/MacO/.. binary
62 * ist sondern z.B. einfach nur Instruktionen oder ein Bootsektor oder
63 * foo
64 */
65 template <typename ELFT>
66 LLVMDisassembler<ELFT>::LLVMDisassembler(const std::string& filename,
67 InformationManager* manager,
68 ObjectFile* file)
69 : Disassembler()
70 , logger(log4cxx::Logger::getLogger("disassembler.LLVMDisassembler"))
71 , triple("unknown-unknown-unknown")
72 , manager(manager)
73 {
74 LOG4CXX_DEBUG(logger, "Handling file " << filename);
75
76 if (!file) {
77 auto result = createBinary(filename);
78
79 error_code ec;
80 if ((ec = result.getError())) {
81 LOG4CXX_ERROR(logger, "Failed to load Binary" << ec.message());
82 binary = NULL;
83 return;
84 }
85
86 binary.reset(result.get());
87
88 o = dyn_cast<ObjectFile>(binary.get());
89 } else {
90 o = file;
91 binary.reset(file);
92 }
93
94 triple.setArch(Triple::ArchType(o->getArch()));
95 std::string tripleName(triple.getTriple());
96
97 LOG4CXX_INFO(logger, "Architecture " << tripleName);
98
99
100 std::string es;
101 target = TargetRegistry::lookupTarget("", triple, es);
102 if (!target) {
103 LOG4CXX_ERROR(logger, es);
104 return;
105 }
106
107 LOG4CXX_INFO(logger, "Target " << target->getName());
108
109 MRI.reset(target->createMCRegInfo(tripleName));
110 if (!MRI) {
111 LOG4CXX_ERROR(logger, "no register info for target " << tripleName);
112 return;
113 }
114
115 // Set up disassembler.
116 AsmInfo.reset(target->createMCAsmInfo(*MRI, tripleName));
117 if (!AsmInfo) {
118 LOG4CXX_ERROR(logger, "no assembly info for target " << tripleName);
119 return;
120 }
121
122 STI.reset(target->createMCSubtargetInfo(tripleName, "", ""));
123 if (!STI) {
124 LOG4CXX_ERROR(logger, "no subtarget info for target " << tripleName);
125 return;
126 }
127
128 MII.reset(target->createMCInstrInfo());
129 if (!MII) {
130 LOG4CXX_ERROR(logger, "no instruction info for target " << tripleName);
131 return;
132 }
133
134 MOFI.reset(new MCObjectFileInfo);
135 MCContext Ctx(AsmInfo.get(), MRI.get(), MOFI.get());
136
137 DisAsm.reset(target->createMCDisassembler(*STI, Ctx));
138 if (!DisAsm) {
139 LOG4CXX_ERROR(logger, "no disassembler for target " << tripleName);
140 return;
141 }
142 RelInfo.reset(
143 target->createMCRelocationInfo(tripleName, Ctx));
144 if (RelInfo) {
145 // Symzer.reset(
146 // MCObjectSymbolizer::createObjectSymbolizer(Ctx, std::move(RelInfo), o));
147 // if (Symzer)
148 // DisAsm->setSymbolizer(std::move(Symzer));
149 }
150 RelInfo.release();
151 Symzer.release();
152
153 MIA.reset(target->createMCInstrAnalysis(MII.get()));
154 if (!MIA) {
155 LOG4CXX_ERROR(logger, "no instruction analysis for target " << tripleName);
156 return;
157 }
158
159 int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
160 IP.reset(target->createMCInstPrinter(AsmPrinterVariant, *AsmInfo, *MII, *MRI, *STI));
161 if (!IP) {
162 LOG4CXX_ERROR(logger, "no instruction printer for target " << tripleName);
163 return;
164 }
165
166 IP->setPrintImmHex(llvm::HexStyle::C);
167 IP->setPrintImmHex(true);
168
169 std::unique_ptr<MCObjectDisassembler> OD(
170 new MCObjectDisassembler(*o, *DisAsm, *MIA));
171 Mod.reset(OD->buildModule(false));
172
173 readSections();
174 }
175
176 template <typename ELFT>
177 void LLVMDisassembler<ELFT>::start() {
178 readSymbols();
179 disassemble();
180 readDynamicSymbols();
181 }
182
183 template <typename ELFT>
184 LLVMDisassembler<ELFT>::~LLVMDisassembler() {}
185
186 template <typename ELFT>
187 Function* LLVMDisassembler<ELFT>::disassembleFunctionAt(uint64_t address, const std::string& name) {
188 Function * function;
189 SectionRef text_section = getTextSection();
190 uint64_t base_address, size;
191 text_section.getAddress(base_address);
192 text_section.getSize(size);
193
194 if (address < base_address ||
195 address >= base_address + size) {
196 return NULL;
197 }
198
199 if (NULL == (function = manager->getFunction(address))) {
200
201 if (name == "") {
202 std::stringstream s;
203 s << "<Unnamed 0x" << std::hex << address << ">";
204 function = manager->newFunction(address);
205 function->setName(s.str());
206 } else {
207 function = manager->newFunction(address);
208 function->setName(name);
209 }
210 disassembleFunction(function);
211 }
212
213 return function;
214 }
215
216 template <typename ELFT>
217 void LLVMDisassembler<ELFT>::disassembleFunction(Function* function) {
218 std::vector<uint64_t> called_functions;
219 std::stack<BasicBlock*> remaining_blocks;
220 /* TODO:
221 * Do all blocks get added properly? We should take care to remove
222 * the other ones at the end of the function!
223 */
224 std::map<uint64_t, BasicBlock*> new_blocks;
225 SectionRef text_section = getTextSection();
226 StringRef bytes;
227 text_section.getContents(bytes);
228 StringRefMemoryObject ref(bytes);
229
230 LOG4CXX_DEBUG(logger, "Handling function " << function->getName());
231
232 BasicBlock * block = manager->newBasicBlock(function->getStartAddress());
233 remaining_blocks.push(block);
234 new_blocks.insert(std::make_pair(block->getStartAddress(), block));
235 function->addBasicBlock(block);
236
237 uint64_t base_address, size;
238 text_section.getAddress(base_address);
239 text_section.getSize(size);
240 LOG4CXX_DEBUG(logger, "Text section at " << std::hex << base_address << " with size " << size);
241
242 while (remaining_blocks.size()) {
243 BasicBlock * current_block = remaining_blocks.top();
244 remaining_blocks.pop();
245
246 LOG4CXX_DEBUG(logger, "Handling Block starting at " << std::hex
247 << current_block->getStartAddress());
248
249 uint64_t inst_size;
250 uint64_t current_address = current_block->getStartAddress() - base_address;
251 while(true) {
252 MCInst inst;
253 std::string buf;
254 llvm::raw_string_ostream s(buf);
255
256 if(llvm::MCDisassembler::Success ==
257 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
258 uint64_t jmptarget;
259
260 if (MIA->evaluateBranch(inst, current_address, inst_size, jmptarget)) {
261 jmptarget += base_address;
262 if (!MIA->isIndirectBranch(inst)) {
263 if (MIA->isCall(inst)) {
264 if (NULL == manager->getFunction(jmptarget))
265 called_functions.push_back(jmptarget);
266 } else {
267 current_block->setNextBlock(0, jmptarget);
268 if (new_blocks.find(jmptarget) == new_blocks.end()) {
269 BasicBlock * block = manager->newBasicBlock(jmptarget);
270 assert(block);
271 new_blocks.insert(std::make_pair(block->getStartAddress(), block));
272 function->addBasicBlock(block);
273 remaining_blocks.push(block);
274 } else {
275 LOG4CXX_DEBUG(logger, "Reusing Block starting at " << std::hex
276 << current_block->getStartAddress());
277 function->addBasicBlock(new_blocks.find(jmptarget)->second);
278 }
279 if (MIA->isConditionalBranch(inst)) {
280 jmptarget = base_address + current_address + inst_size;
281 current_block->setNextBlock(1, jmptarget);
282 if (new_blocks.find(jmptarget) == new_blocks.end()) {
283 BasicBlock * block = manager->newBasicBlock(jmptarget);
284 assert(block);
285 new_blocks.insert(std::make_pair(block->getStartAddress(), block));
286 function->addBasicBlock(block);
287 remaining_blocks.push(block);
288 } else {
289 LOG4CXX_DEBUG(logger, "Reusing Block starting at " << std::hex
290 << current_block->getStartAddress());
291 function->addBasicBlock(new_blocks.find(jmptarget)->second);
292 }
293 }
294 }
295 }
296 }
297 } else {
298 inst_size = 0;
299 }
300
301
302 if (inst_size == 0 || MIA->isTerminator(inst) || MIA->isBranch(inst)) {
303 current_block->setEndAddress(current_address + base_address + inst_size);
304 LOG4CXX_DEBUG(logger, "Finished Block at " << std::hex <<
305 current_block->getEndAddress());
306 break;
307 }
308 current_address += inst_size;
309 }
310 }
311 splitBlocks(function);
312 LOG4CXX_DEBUG(logger, "Finished function " << function->getName());
313 manager->finishFunction(function);
314 for (uint64_t address : called_functions)
315 disassembleFunctionAt(address);
316 }
317
318 template <typename ELFT>
319 void LLVMDisassembler<ELFT>::disassemble() {
320 SectionRef text_section = getTextSection();
321 std::vector<Function*> remaining_functions;
322
323 // Assume all function symbols actually start a real function
324 for (auto x = symbols.begin(); x != symbols.end(); ++x) {
325 uint64_t result;
326 bool contains;
327 SymbolRef::Type symbol_type;
328
329
330 if (text_section.containsSymbol(x->second, contains) || !contains)
331 continue;
332
333 if (x->second.getType(symbol_type)
334 || SymbolRef::ST_Function != symbol_type)
335 continue;
336
337 if (!x->second.getAddress(result)) {
338 Function * fun = manager->newFunction(result);
339 if (fun) {
340 fun->setName(x->first);
341 remaining_functions.push_back(fun);
342 LOG4CXX_DEBUG(logger, "Disasembling " << x->first);
343 } else {
344 LOG4CXX_DEBUG(logger, "Function at " << std::hex << result
345 << " already disassembled as " << manager->getFunction(result)->getName());
346 }
347 }
348 }
349
350 for (Function* function : remaining_functions) {
351 disassembleFunction(function);
352 manager->finishFunction(function);
353 }
354
355 if (binary->isELF()) {
356 uint64_t _entryAddress = entryAddress();
357 LOG4CXX_DEBUG(logger, "Adding entryAddress at: " << std::hex << _entryAddress);
358 std::stringstream s;
359 s << "<_start 0x" << std::hex << _entryAddress << ">";
360
361 disassembleFunctionAt(_entryAddress, s.str());
362 }
363
364 if (!manager->hasFunctions()) {
365 uint64_t text_entry;
366 text_section.getAddress(text_entry);
367 LOG4CXX_INFO(logger, "No Symbols found, starting at the beginning of the text segment");
368 disassembleFunctionAt(text_entry);
369 }
370 }
371
372 template <>
373 uint64_t LLVMDisassembler<COFFT>::entryAddress() {
374 const auto coffobject = dyn_cast<COFFObjectFile>(o);
375 const struct pe32_header* pe32_header;
376 const struct pe32plus_header* pe32plus_header;
377
378 coffobject->getPE32PlusHeader(pe32plus_header);
379
380 if (pe32plus_header) {
381 return pe32plus_header->AddressOfEntryPoint;
382 } else {
383 coffobject->getPE32Header(pe32_header);
384 return pe32_header->AddressOfEntryPoint;
385 }
386 }
387
388 template<>
389 uint64_t LLVMDisassembler<MACHOT>::entryAddress() {
390 // TODO
391 return 0;
392 }
393
394 template <typename ELFT>
395 uint64_t LLVMDisassembler<ELFT>::entryAddress() {
396 const auto elffile = dyn_cast<ELFObjectFile<ELFT>>(o)->getELFFile();
397 const auto * header = elffile->getHeader();
398
399 return header->e_entry;
400 }
401
402 template <typename ELFT>
403 void LLVMDisassembler<ELFT>::splitBlocks(Function* function) {
404 SectionRef text_section = getTextSection();
405 StringRef bytes;
406 text_section.getContents(bytes);
407 StringRefMemoryObject ref(bytes);
408
409 LOG4CXX_DEBUG(logger, "Splitting Blocks in Function " << function->getName());
410 // Split blocks where jumps are going inside the block
411 for (auto it = function->blocks().begin();
412 it != function->blocks().end();
413 ++it) {
414 BasicBlock * current_block = it->second;
415 if (current_block->getEndAddress() == 0) {
416 LOG4CXX_ERROR(logger, "UNFINISHED BLOCK " << std::hex << current_block->getStartAddress());
417 break;
418 }
419 uint64_t inst_size;
420 uint64_t base_address;
421 text_section.getAddress(base_address);
422 uint64_t current_address = current_block->getStartAddress() - base_address;
423 while(current_block->getEndAddress() - base_address > current_address) {
424 MCInst inst;
425 std::string buf;
426 llvm::raw_string_ostream s(buf);
427
428 if(llvm::MCDisassembler::Success ==
429 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
430 // See if some other block starts here
431 BasicBlock* other = manager->getBasicBlock(current_address
432 + inst_size
433 + base_address);
434
435 // Special case, other block starts here but we are at the end anyway
436 if (other != NULL) {
437 uint64_t endaddress = current_address + inst_size + base_address;
438 if (endaddress != current_block->getEndAddress()) {
439 LOG4CXX_DEBUG(logger, "Shortening block starting at "
440 << std::hex
441 << current_block->getStartAddress()
442 << " now ending at "
443 << other->getStartAddress());
444 function->addBasicBlock(other);
445 current_block->setEndAddress(endaddress);
446 current_block->setNextBlock(0, other->getStartAddress());
447 current_block->setNextBlock(1, 0);
448 }
449 }
450 } else {
451 inst_size = 1;
452 }
453 current_address += inst_size;
454 }
455 }
456 }
457
458 template<>
459 void LLVMDisassembler<COFFT>::readDynamicSymbols() {
460 //TODO
461 }
462
463 template<>
464 void LLVMDisassembler<MACHOT>::readDynamicSymbols() {
465 //TODO
466 }
467
468 template <typename ELFT>
469 void LLVMDisassembler<ELFT>::readDynamicSymbols() {
470 const auto elffile = dyn_cast<ELFObjectFile<ELFT>>(o)->getELFFile();
471 for (auto it = elffile->begin_dynamic_symbols(),
472 end = elffile->end_dynamic_symbols();
473 it != end;
474 ++it) {
475 if (it->getType() == 2) { // Function
476 bool is_default;
477 // TODO: Error handling
478 std::string symbolname = *(elffile->getSymbolName(it));
479 std::string symbolversion = *(elffile->getSymbolVersion(nullptr, &*it, is_default));
480 // TODO: actually get the symbol address from relocations
481 Function* f = manager->newDynamicFunction(0);
482 f->setName(symbolname + (is_default? "@@" : "@") + symbolversion);
483 manager->finishFunction(f);
484
485 LOG4CXX_DEBUG(logger, "Adding dynamic Symbol " << symbolname << (is_default? "@@" : "@") << symbolversion);
486 }
487 }
488 }
489
490 template <typename ELFT>
491 void LLVMDisassembler<ELFT>::readSymbols() {
492 error_code ec;
493 symbol_iterator si(o->symbol_begin()), se(o->symbol_end());
494 for (; si != se; ++si) {
495 StringRef name;
496 if ((ec = si->getName(name))) {
497 LOG4CXX_ERROR(logger, ec.message());
498 break;
499 }
500 LOG4CXX_DEBUG(logger, "Added symbol " << name.str());
501 symbols.insert(make_pair(name.str(), *si));
502 }
503 }
504
505 template <typename ELFT>
506 void LLVMDisassembler<ELFT>::readSections() {
507 error_code ec;
508 section_iterator i(o->section_begin()), e(o->section_end());
509 for (; i != e; ++i) {
510 StringRef name;
511 if ((ec = i->getName(name))) {
512 LOG4CXX_ERROR(logger, ec.message());
513 break;
514 }
515 LOG4CXX_DEBUG(logger, "Added section " << name.str());
516 sections.insert(make_pair(name.str(), *i));
517 }
518
519 }
520
521 // template <typename ELFT>
522 // void LLVMDisassembler<ELFT>::forEachFunction(std::function<void (uint64_t, Function*)> callback) {
523 // // std::for_each(functions.begin(), functions.end(),
524 // // [&](std::pair<uint64_t, Function*> x) {
525 // // callback(x.first, x.second);
526 // // });
527 // }
528
529 template <typename ELFT>
530 std::vector<Instruction> LLVMDisassembler<ELFT>::getInstructions(const BasicBlock *block) {
531 std::vector<Instruction> result;
532 SectionRef text_section = getTextSection();
533 uint64_t base_address;
534 text_section.getAddress(base_address);
535 uint64_t current_address = block->getStartAddress() - base_address;
536 uint64_t end_position = block->getEndAddress() - base_address;
537
538 StringRef bytes;
539 text_section.getContents(bytes);
540 StringRefMemoryObject ref(bytes);
541
542 while (current_address < end_position) {
543 uint64_t inst_size;
544 MCInst inst;
545 std::string buf;
546 llvm::raw_string_ostream s(buf);
547
548 if(llvm::MCDisassembler::Success ==
549 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
550
551 uint8_t bytes[inst_size+2];
552 ref.readBytes(current_address, inst_size, bytes);
553
554 uint64_t jmptarget;
555 std::string ref("");
556 IP->printInst(&inst, s, "");
557 if (MIA->evaluateBranch(inst, current_address, inst_size, jmptarget)) {
558 std::stringstream stream;
559 if (MIA->isCall(inst))
560 stream << "function:";
561 else
562 stream << "block:";
563
564 stream << std::hex << (base_address + jmptarget);
565 ref = stream.str();
566 }
567 result.push_back(Instruction(current_address + base_address, boost::algorithm::trim_copy(s.str()),
568 std::vector<uint8_t>(bytes, bytes+inst_size), ref));
569 } else {
570 LOG4CXX_WARN(logger, "Invalid byte at" << std::hex << current_address + base_address);
571 uint8_t bytes[1];
572 ref.readBytes(current_address, 1, bytes);
573 result.push_back(Instruction(current_address + base_address, "Invalid Instruction",
574 std::vector<uint8_t>(bytes, bytes+1), ""));
575 inst_size = 1;
576 }
577
578 current_address += inst_size;
579 }
580 return result;
581 }
582
583 template <typename ELFT>
584 void LLVMDisassembler<ELFT>::printEachInstruction(uint64_t start, uint64_t end,
585 std::function<void (uint8_t*, size_t,
586 const std::string&,
587 const std::string&)> fun) {
588 SectionRef text_section = getTextSection();
589 uint64_t base_address;
590 text_section.getAddress(base_address);
591 uint64_t current_address = start - base_address;
592
593 StringRef bytes;
594 text_section.getContents(bytes);
595 StringRefMemoryObject ref(bytes);
596
597 while (current_address < end - base_address) {
598 uint64_t inst_size;
599 MCInst inst;
600 std::string buf;
601 llvm::raw_string_ostream s(buf);
602
603 if(llvm::MCDisassembler::Success ==
604 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
605
606 uint8_t bytes[inst_size+2];
607 ref.readBytes(current_address, inst_size, bytes);
608
609 uint64_t jmptarget;
610 std::string ref("");
611 IP->printInst(&inst, s, "");
612 if (MIA->evaluateBranch(inst, current_address, inst_size, jmptarget)) {
613 std::stringstream stream;
614 if (MIA->isCall(inst))
615 stream << "function:";
616 else
617 stream << "block:";
618
619 stream << std::hex << (base_address + jmptarget);
620 ref = stream.str();
621 }
622
623
624 fun(bytes, inst_size, s.str(), ref);
625 } else {
626 LOG4CXX_WARN(logger, "Invalid byte at" << std::hex << current_address + base_address);
627 fun(NULL, 0, "Invalid Byte", "");
628 inst_size = 1;
629 }
630
631 current_address += inst_size;
632 }
633 }
634
635 template <typename ELFT>
636 SectionRef LLVMDisassembler<ELFT>::getTextSection() {
637 return sections[".text"];
638 }
639
640 template <>
641 SectionRef LLVMDisassembler<MACHOT>::getTextSection() {
642 return sections["__text"];
643 }