]> git.siccegge.de Git - frida/frida.git/blob - src/disassembler/llvm/LLVMDisassembler.cxx
Rework API for getting at instructions
[frida/frida.git] / src / disassembler / llvm / LLVMDisassembler.cxx
1 #include "disassembler/Instruction.hxx"
2 #include "disassembler/llvm/LLVMDisassembler.hxx"
3 #include "core/InformationManager.hxx"
4 #include "core/Function.hxx"
5 #include "core/BasicBlock.hxx"
6
7 #include <stack>
8 #include <algorithm>
9 #include <cassert>
10
11 using namespace llvm;
12 using namespace llvm::object;
13 using std::error_code;
14
namespace {
    /* Tag type: selects the COFF specialisations of LLVMDisassembler. */
    class COFFT {};

    /* Tag type: selects the Mach-O specialisations of LLVMDisassembler. */
    class MACHOT {};
}
24
25 /*
26 *
27 */
28 Disassembler * createLLVMDisassembler(const std::string& filename, InformationManager* manager) {
29 if (filename == "")
30 return NULL;
31
32 std::unique_ptr<Binary> o;
33 o.reset(createBinary(filename).get());
34 Binary * op = o.release();
35
36 // ELFType<endian, maxalign, 64bit>
37 if (ELF32LEObjectFile * object = dyn_cast<ELF32LEObjectFile>(op)) {
38 return new LLVMDisassembler<ELFType<support::little, 2, false>>(filename, manager, object);
39 }
40 if (ELF64LEObjectFile * object = dyn_cast<ELF64LEObjectFile>(op)) {
41 return new LLVMDisassembler<ELFType<support::little, 2, true>>(filename, manager, object);
42 }
43 if (ELF32BEObjectFile * object = dyn_cast<ELF32BEObjectFile>(op)) {
44 return new LLVMDisassembler<ELFType<support::big, 2, false>>(filename, manager, object);
45 }
46 if (ELF64BEObjectFile * object = dyn_cast<ELF64BEObjectFile>(op)) {
47 return new LLVMDisassembler<ELFType<support::big, 2, true>>(filename, manager, object);
48 }
49 if (COFFObjectFile * object = dyn_cast<COFFObjectFile>(op)) {
50 return new LLVMDisassembler<COFFT>(filename, manager, object);
51 }
52 if (MachOObjectFile * object = dyn_cast<MachOObjectFile>(op)) {
53 return new LLVMDisassembler<MACHOT>(filename, manager, object);
54 }
55
56 return NULL;
57 }
58
59 /*
60 * TODO: fallback code falls die Datei kein ELF/PE/COFF/MacO/.. binary
61 * ist sondern z.B. einfach nur Instruktionen oder ein Bootsektor oder
62 * foo
63 */
64 template <typename ELFT>
65 LLVMDisassembler<ELFT>::LLVMDisassembler(const std::string& filename,
66 InformationManager* manager,
67 ObjectFile* file)
68 : Disassembler()
69 , logger(log4cxx::Logger::getLogger("disassembler.LLVMDisassembler"))
70 , triple("unknown-unknown-unknown")
71 , manager(manager)
72 {
73 LOG4CXX_DEBUG(logger, "Handling file " << filename);
74
75 if (!file) {
76 auto result = createBinary(filename);
77
78 error_code ec;
79 if ((ec = result.getError())) {
80 LOG4CXX_ERROR(logger, "Failed to load Binary" << ec.message());
81 binary = NULL;
82 return;
83 }
84
85 binary.reset(result.get());
86
87 o = dyn_cast<ObjectFile>(binary.get());
88 } else {
89 o = file;
90 binary.reset(file);
91 }
92
93 triple.setArch(Triple::ArchType(o->getArch()));
94 std::string tripleName(triple.getTriple());
95
96 LOG4CXX_INFO(logger, "Architecture " << tripleName);
97
98
99 std::string es;
100 target = TargetRegistry::lookupTarget("", triple, es);
101 if (!target) {
102 LOG4CXX_ERROR(logger, es);
103 return;
104 }
105
106 LOG4CXX_INFO(logger, "Target " << target->getName());
107
108 MRI.reset(target->createMCRegInfo(tripleName));
109 if (!MRI) {
110 LOG4CXX_ERROR(logger, "no register info for target " << tripleName);
111 return;
112 }
113
114 // Set up disassembler.
115 AsmInfo.reset(target->createMCAsmInfo(*MRI, tripleName));
116 if (!AsmInfo) {
117 LOG4CXX_ERROR(logger, "no assembly info for target " << tripleName);
118 return;
119 }
120
121 STI.reset(target->createMCSubtargetInfo(tripleName, "", ""));
122 if (!STI) {
123 LOG4CXX_ERROR(logger, "no subtarget info for target " << tripleName);
124 return;
125 }
126
127 MII.reset(target->createMCInstrInfo());
128 if (!MII) {
129 LOG4CXX_ERROR(logger, "no instruction info for target " << tripleName);
130 return;
131 }
132
133 MOFI.reset(new MCObjectFileInfo);
134 MCContext Ctx(AsmInfo.get(), MRI.get(), MOFI.get());
135
136 DisAsm.reset(target->createMCDisassembler(*STI, Ctx));
137 if (!DisAsm) {
138 LOG4CXX_ERROR(logger, "no disassembler for target " << tripleName);
139 return;
140 }
141 RelInfo.reset(
142 target->createMCRelocationInfo(tripleName, Ctx));
143 if (RelInfo) {
144 // Symzer.reset(
145 // MCObjectSymbolizer::createObjectSymbolizer(Ctx, std::move(RelInfo), o));
146 // if (Symzer)
147 // DisAsm->setSymbolizer(std::move(Symzer));
148 }
149 RelInfo.release();
150 Symzer.release();
151
152 MIA.reset(target->createMCInstrAnalysis(MII.get()));
153 if (!MIA) {
154 LOG4CXX_ERROR(logger, "no instruction analysis for target " << tripleName);
155 return;
156 }
157
158 int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
159 IP.reset(target->createMCInstPrinter(AsmPrinterVariant, *AsmInfo, *MII, *MRI, *STI));
160 if (!IP) {
161 LOG4CXX_ERROR(logger, "no instruction printer for target " << tripleName);
162 return;
163 }
164
165 IP->setPrintImmHex(llvm::HexStyle::C);
166 IP->setPrintImmHex(true);
167
168 std::unique_ptr<MCObjectDisassembler> OD(
169 new MCObjectDisassembler(*o, *DisAsm, *MIA));
170 Mod.reset(OD->buildModule(false));
171
172 readSections();
173 }
174
175 template <typename ELFT>
176 void LLVMDisassembler<ELFT>::start() {
177 readSymbols();
178 disassemble();
179 readDynamicSymbols();
180 }
181
182 template <typename ELFT>
183 LLVMDisassembler<ELFT>::~LLVMDisassembler() {}
184
185 template <typename ELFT>
186 Function* LLVMDisassembler<ELFT>::disassembleFunctionAt(uint64_t address, const std::string& name) {
187 Function * function;
188 SectionRef text_section = getTextSection();
189 uint64_t base_address, size;
190 text_section.getAddress(base_address);
191 text_section.getSize(size);
192
193 if (address < base_address ||
194 address >= base_address + size) {
195 return NULL;
196 }
197
198 if (NULL == (function = manager->getFunction(address))) {
199
200 if (name == "") {
201 std::stringstream s;
202 s << "<Unnamed 0x" << std::hex << address << ">";
203 function = manager->newFunction(address);
204 function->setName(s.str());
205 } else {
206 function = manager->newFunction(address);
207 function->setName(name);
208 }
209 disassembleFunction(function);
210 }
211
212 return function;
213 }
214
215 template <typename ELFT>
216 void LLVMDisassembler<ELFT>::disassembleFunction(Function* function) {
217 std::vector<uint64_t> called_functions;
218 std::stack<BasicBlock*> remaining_blocks;
219 /* TODO:
220 * Do all blocks get added properly? We should take care to remove
221 * the other ones at the end of the function!
222 */
223 std::map<uint64_t, BasicBlock*> new_blocks;
224 SectionRef text_section = getTextSection();
225 StringRef bytes;
226 text_section.getContents(bytes);
227 StringRefMemoryObject ref(bytes);
228
229 LOG4CXX_DEBUG(logger, "Handling function " << function->getName());
230
231 BasicBlock * block = manager->newBasicBlock(function->getStartAddress());
232 remaining_blocks.push(block);
233 new_blocks.insert(std::make_pair(block->getStartAddress(), block));
234 function->addBasicBlock(block);
235
236 uint64_t base_address, size;
237 text_section.getAddress(base_address);
238 text_section.getSize(size);
239 LOG4CXX_DEBUG(logger, "Text section at " << std::hex << base_address << " with size " << size);
240
241 while (remaining_blocks.size()) {
242 BasicBlock * current_block = remaining_blocks.top();
243 remaining_blocks.pop();
244
245 LOG4CXX_DEBUG(logger, "Handling Block starting at " << std::hex
246 << current_block->getStartAddress());
247
248 uint64_t inst_size;
249 uint64_t current_address = current_block->getStartAddress() - base_address;
250 while(true) {
251 MCInst inst;
252 std::string buf;
253 llvm::raw_string_ostream s(buf);
254
255 if(llvm::MCDisassembler::Success ==
256 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
257 uint64_t jmptarget;
258
259 if (MIA->evaluateBranch(inst, current_address, inst_size, jmptarget)) {
260 jmptarget += base_address;
261 if (!MIA->isIndirectBranch(inst)) {
262 if (MIA->isCall(inst)) {
263 if (NULL == manager->getFunction(jmptarget))
264 called_functions.push_back(jmptarget);
265 } else {
266 current_block->setNextBlock(0, jmptarget);
267 if (new_blocks.find(jmptarget) == new_blocks.end()) {
268 BasicBlock * block = manager->newBasicBlock(jmptarget);
269 assert(block);
270 new_blocks.insert(std::make_pair(block->getStartAddress(), block));
271 function->addBasicBlock(block);
272 remaining_blocks.push(block);
273 } else {
274 LOG4CXX_DEBUG(logger, "Reusing Block starting at " << std::hex
275 << current_block->getStartAddress());
276 function->addBasicBlock(new_blocks.find(jmptarget)->second);
277 }
278 if (MIA->isConditionalBranch(inst)) {
279 jmptarget = base_address + current_address + inst_size;
280 current_block->setNextBlock(1, jmptarget);
281 if (new_blocks.find(jmptarget) == new_blocks.end()) {
282 BasicBlock * block = manager->newBasicBlock(jmptarget);
283 assert(block);
284 new_blocks.insert(std::make_pair(block->getStartAddress(), block));
285 function->addBasicBlock(block);
286 remaining_blocks.push(block);
287 } else {
288 LOG4CXX_DEBUG(logger, "Reusing Block starting at " << std::hex
289 << current_block->getStartAddress());
290 function->addBasicBlock(new_blocks.find(jmptarget)->second);
291 }
292 }
293 }
294 }
295 }
296 } else {
297 inst_size = 0;
298 }
299
300
301 if (inst_size == 0 || MIA->isTerminator(inst) || MIA->isBranch(inst)) {
302 current_block->setEndAddress(current_address + base_address + inst_size);
303 LOG4CXX_DEBUG(logger, "Finished Block at " << std::hex <<
304 current_block->getEndAddress());
305 break;
306 }
307 current_address += inst_size;
308 }
309 }
310 splitBlocks(function);
311 LOG4CXX_DEBUG(logger, "Finished function " << function->getName());
312 manager->finishFunction(function);
313 for (uint64_t address : called_functions)
314 disassembleFunctionAt(address);
315 }
316
317 template <typename ELFT>
318 void LLVMDisassembler<ELFT>::disassemble() {
319 SectionRef text_section = getTextSection();
320 std::vector<Function*> remaining_functions;
321
322 // Assume all function symbols actually start a real function
323 for (auto x = symbols.begin(); x != symbols.end(); ++x) {
324 uint64_t result;
325 bool contains;
326 SymbolRef::Type symbol_type;
327
328
329 if (text_section.containsSymbol(x->second, contains) || !contains)
330 continue;
331
332 if (x->second.getType(symbol_type)
333 || SymbolRef::ST_Function != symbol_type)
334 continue;
335
336 if (!x->second.getAddress(result)) {
337 Function * fun = manager->newFunction(result);
338 if (fun) {
339 fun->setName(x->first);
340 remaining_functions.push_back(fun);
341 LOG4CXX_DEBUG(logger, "Disasembling " << x->first);
342 } else {
343 LOG4CXX_DEBUG(logger, "Function at " << std::hex << result
344 << " already disassembled as " << manager->getFunction(result)->getName());
345 }
346 }
347 }
348
349 for (Function* function : remaining_functions) {
350 disassembleFunction(function);
351 manager->finishFunction(function);
352 }
353
354 if (binary->isELF()) {
355 uint64_t _entryAddress = entryAddress();
356 LOG4CXX_DEBUG(logger, "Adding entryAddress at: " << std::hex << _entryAddress);
357 std::stringstream s;
358 s << "<_start 0x" << std::hex << _entryAddress << ">";
359
360 disassembleFunctionAt(_entryAddress, s.str());
361 }
362
363 if (!manager->hasFunctions()) {
364 uint64_t text_entry;
365 text_section.getAddress(text_entry);
366 LOG4CXX_INFO(logger, "No Symbols found, starting at the beginning of the text segment");
367 disassembleFunctionAt(text_entry);
368 }
369 }
370
371 template <>
372 uint64_t LLVMDisassembler<COFFT>::entryAddress() {
373 const auto coffobject = dyn_cast<COFFObjectFile>(o);
374 const struct pe32_header* pe32_header;
375 const struct pe32plus_header* pe32plus_header;
376
377 coffobject->getPE32PlusHeader(pe32plus_header);
378
379 if (pe32plus_header) {
380 return pe32plus_header->AddressOfEntryPoint;
381 } else {
382 coffobject->getPE32Header(pe32_header);
383 return pe32_header->AddressOfEntryPoint;
384 }
385 }
386
387 template<>
388 uint64_t LLVMDisassembler<MACHOT>::entryAddress() {
389 // TODO
390 return 0;
391 }
392
393 template <typename ELFT>
394 uint64_t LLVMDisassembler<ELFT>::entryAddress() {
395 const auto elffile = dyn_cast<ELFObjectFile<ELFT>>(o)->getELFFile();
396 const auto * header = elffile->getHeader();
397
398 return header->e_entry;
399 }
400
401 template <typename ELFT>
402 void LLVMDisassembler<ELFT>::splitBlocks(Function* function) {
403 SectionRef text_section = getTextSection();
404 StringRef bytes;
405 text_section.getContents(bytes);
406 StringRefMemoryObject ref(bytes);
407
408 LOG4CXX_DEBUG(logger, "Splitting Blocks in Function " << function->getName());
409 // Split blocks where jumps are going inside the block
410 for (auto it = function->blocks().begin();
411 it != function->blocks().end();
412 ++it) {
413 BasicBlock * current_block = it->second;
414 if (current_block->getEndAddress() == 0) {
415 LOG4CXX_ERROR(logger, "UNFINISHED BLOCK " << std::hex << current_block->getStartAddress());
416 break;
417 }
418 uint64_t inst_size;
419 uint64_t base_address;
420 text_section.getAddress(base_address);
421 uint64_t current_address = current_block->getStartAddress() - base_address;
422 while(current_block->getEndAddress() - base_address > current_address) {
423 MCInst inst;
424 std::string buf;
425 llvm::raw_string_ostream s(buf);
426
427 if(llvm::MCDisassembler::Success ==
428 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
429 // See if some other block starts here
430 BasicBlock* other = manager->getBasicBlock(current_address
431 + inst_size
432 + base_address);
433
434 // Special case, other block starts here but we are at the end anyway
435 if (other != NULL) {
436 uint64_t endaddress = current_address + inst_size + base_address;
437 if (endaddress != current_block->getEndAddress()) {
438 LOG4CXX_DEBUG(logger, "Shortening block starting at "
439 << std::hex
440 << current_block->getStartAddress()
441 << " now ending at "
442 << other->getStartAddress());
443 function->addBasicBlock(other);
444 current_block->setEndAddress(endaddress);
445 current_block->setNextBlock(0, other->getStartAddress());
446 current_block->setNextBlock(1, 0);
447 }
448 }
449 } else {
450 inst_size = 1;
451 }
452 current_address += inst_size;
453 }
454 }
455 }
456
457 template<>
458 void LLVMDisassembler<COFFT>::readDynamicSymbols() {
459 //TODO
460 }
461
462 template<>
463 void LLVMDisassembler<MACHOT>::readDynamicSymbols() {
464 //TODO
465 }
466
467 template <typename ELFT>
468 void LLVMDisassembler<ELFT>::readDynamicSymbols() {
469 const auto elffile = dyn_cast<ELFObjectFile<ELFT>>(o)->getELFFile();
470 for (auto it = elffile->begin_dynamic_symbols(),
471 end = elffile->end_dynamic_symbols();
472 it != end;
473 ++it) {
474 if (it->getType() == 2) { // Function
475 bool is_default;
476 // TODO: Error handling
477 std::string symbolname = *(elffile->getSymbolName(it));
478 std::string symbolversion = *(elffile->getSymbolVersion(nullptr, &*it, is_default));
479 // TODO: actually get the symbol address from relocations
480 Function* f = manager->newDynamicFunction(0);
481 f->setName(symbolname + (is_default? "@@" : "@") + symbolversion);
482 manager->finishFunction(f);
483
484 LOG4CXX_DEBUG(logger, "Adding dynamic Symbol " << symbolname << (is_default? "@@" : "@") << symbolversion);
485 }
486 }
487 }
488
489 template <typename ELFT>
490 void LLVMDisassembler<ELFT>::readSymbols() {
491 error_code ec;
492 symbol_iterator si(o->symbol_begin()), se(o->symbol_end());
493 for (; si != se; ++si) {
494 StringRef name;
495 if ((ec = si->getName(name))) {
496 LOG4CXX_ERROR(logger, ec.message());
497 break;
498 }
499 LOG4CXX_DEBUG(logger, "Added symbol " << name.str());
500 symbols.insert(make_pair(name.str(), *si));
501 }
502 }
503
504 template <typename ELFT>
505 void LLVMDisassembler<ELFT>::readSections() {
506 error_code ec;
507 section_iterator i(o->section_begin()), e(o->section_end());
508 for (; i != e; ++i) {
509 StringRef name;
510 if ((ec = i->getName(name))) {
511 LOG4CXX_ERROR(logger, ec.message());
512 break;
513 }
514 LOG4CXX_DEBUG(logger, "Added section " << name.str());
515 sections.insert(make_pair(name.str(), *i));
516 }
517
518 }
519
520 // template <typename ELFT>
521 // void LLVMDisassembler<ELFT>::forEachFunction(std::function<void (uint64_t, Function*)> callback) {
522 // // std::for_each(functions.begin(), functions.end(),
523 // // [&](std::pair<uint64_t, Function*> x) {
524 // // callback(x.first, x.second);
525 // // });
526 // }
527
528 template <typename ELFT>
529 std::list<Instruction> LLVMDisassembler<ELFT>::getInstructions(const BasicBlock *block) {
530 std::list<Instruction> result;
531 SectionRef text_section = getTextSection();
532 uint64_t base_address;
533 text_section.getAddress(base_address);
534 uint64_t current_address = block->getStartAddress() - base_address;
535 uint64_t end_position = block->getEndAddress() - base_address;
536
537 StringRef bytes;
538 text_section.getContents(bytes);
539 StringRefMemoryObject ref(bytes);
540
541 while (current_address < end_position) {
542 uint64_t inst_size;
543 MCInst inst;
544 std::string buf;
545 llvm::raw_string_ostream s(buf);
546
547 if(llvm::MCDisassembler::Success ==
548 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
549
550 uint8_t bytes[inst_size+2];
551 ref.readBytes(current_address, inst_size, bytes);
552
553 uint64_t jmptarget;
554 std::string ref("");
555 IP->printInst(&inst, s, "");
556 if (MIA->evaluateBranch(inst, current_address, inst_size, jmptarget)) {
557 std::stringstream stream;
558 if (MIA->isCall(inst))
559 stream << "function:";
560 else
561 stream << "block:";
562
563 stream << std::hex << (base_address + jmptarget);
564 ref = stream.str();
565 }
566 result.push_back(Instruction(current_address + base_address, s.str(),
567 std::vector<uint8_t>(bytes, bytes+inst_size), ref));
568 } else {
569 LOG4CXX_WARN(logger, "Invalid byte at" << std::hex << current_address + base_address);
570 uint8_t bytes[1];
571 ref.readBytes(current_address, 1, bytes);
572 result.push_back(Instruction(current_address + base_address, "Invalid Instruction",
573 std::vector<uint8_t>(bytes, bytes+1), ""));
574 inst_size = 1;
575 }
576
577 current_address += inst_size;
578 }
579 return result;
580 }
581
582 template <typename ELFT>
583 void LLVMDisassembler<ELFT>::printEachInstruction(uint64_t start, uint64_t end,
584 std::function<void (uint8_t*, size_t,
585 const std::string&,
586 const std::string&)> fun) {
587 SectionRef text_section = getTextSection();
588 uint64_t base_address;
589 text_section.getAddress(base_address);
590 uint64_t current_address = start - base_address;
591
592 StringRef bytes;
593 text_section.getContents(bytes);
594 StringRefMemoryObject ref(bytes);
595
596 while (current_address < end - base_address) {
597 uint64_t inst_size;
598 MCInst inst;
599 std::string buf;
600 llvm::raw_string_ostream s(buf);
601
602 if(llvm::MCDisassembler::Success ==
603 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
604
605 uint8_t bytes[inst_size+2];
606 ref.readBytes(current_address, inst_size, bytes);
607
608 uint64_t jmptarget;
609 std::string ref("");
610 IP->printInst(&inst, s, "");
611 if (MIA->evaluateBranch(inst, current_address, inst_size, jmptarget)) {
612 std::stringstream stream;
613 if (MIA->isCall(inst))
614 stream << "function:";
615 else
616 stream << "block:";
617
618 stream << std::hex << (base_address + jmptarget);
619 ref = stream.str();
620 }
621
622
623 fun(bytes, inst_size, s.str(), ref);
624 } else {
625 LOG4CXX_WARN(logger, "Invalid byte at" << std::hex << current_address + base_address);
626 fun(NULL, 0, "Invalid Byte", "");
627 inst_size = 1;
628 }
629
630 current_address += inst_size;
631 }
632 }
633
634 template <typename ELFT>
635 SectionRef LLVMDisassembler<ELFT>::getTextSection() {
636 return sections[".text"];
637 }
638
639 template <>
640 SectionRef LLVMDisassembler<MACHOT>::getTextSection() {
641 return sections["__text"];
642 }