Make InformationManager responsible for cleaning Blocks / Functions
[frida/frida.git] / src / disassembler / llvm / LLVMDisassembler.cxx
1 #include "disassembler/llvm/LLVMDisassembler.hxx"
2 #include "disassembler/llvm/LLVMBasicBlock.hxx"
3 #include "disassembler/llvm/LLVMFunction.hxx"
4 #include "core/InformationManager.hxx"
5
6 #include <stack>
7 #include <algorithm>
8
9 using namespace llvm;
10 using namespace llvm::object;
11 using std::error_code;
12
13 /*
14 *
15 */
16 Disassembler * createLLVMDisassembler(const std::string& filename, InformationManager* manager) {
17 if (filename == "")
18 return NULL;
19
20 std::unique_ptr<Binary> o;
21 o.reset(createBinary(filename).get());
22 Binary * op = o.release();
23
24 // ELFType<endian, maxalign, 64bit>
25 if (ELF32LEObjectFile * object = dyn_cast<ELF32LEObjectFile>(op)) {
26 return new LLVMDisassembler<ELFType<support::little, 2, false>>(filename, manager, object);
27 }
28 if (ELF64LEObjectFile * object = dyn_cast<ELF64LEObjectFile>(op)) {
29 return new LLVMDisassembler<ELFType<support::little, 2, true>>(filename, manager, object);
30 }
31 if (ELF32BEObjectFile * object = dyn_cast<ELF32BEObjectFile>(op)) {
32 return new LLVMDisassembler<ELFType<support::big, 2, false>>(filename, manager, object);
33 }
34 if (ELF64BEObjectFile * object = dyn_cast<ELF64BEObjectFile>(op)) {
35 return new LLVMDisassembler<ELFType<support::big, 2, true>>(filename, manager, object);
36 }
37
38 return NULL;
39 }
40
41 /*
42 * TODO: fallback code falls die Datei kein ELF/PE/COFF/MacO/.. binary
43 * ist sondern z.B. einfach nur Instruktionen oder ein Bootsektor oder
44 * foo
45 */
46 template <typename ELFT>
47 LLVMDisassembler<ELFT>::LLVMDisassembler(const std::string& filename,
48 InformationManager* manager,
49 ELFObjectFile<ELFT>* file)
50 : Disassembler(filename, manager)
51 , logger(log4cxx::Logger::getLogger("LLVMDisassembler"))
52 , triple("unknown-unknown-unknown")
53 , manager(manager)
54 {
55 LOG4CXX_DEBUG(logger, "Handling file" << filename);
56
57 if (!file) {
58 auto result = createBinary(filename);
59
60 error_code ec;
61 if ((ec = result.getError())) {
62 LOG4CXX_ERROR(logger, "Failed to load Binary" << ec.message());
63 binary = NULL;
64 return;
65 }
66
67 binary.reset(result.get());
68
69 o = dyn_cast<ELFObjectFile<ELFT>>(binary.get());
70 } else {
71 o = file;
72 binary.reset(file);
73 }
74
75 triple.setArch(Triple::ArchType(o->getArch()));
76 std::string tripleName(triple.getTriple());
77
78 LOG4CXX_INFO(logger, "Architecture " << tripleName);
79
80
81 std::string es;
82 target = TargetRegistry::lookupTarget("", triple, es);
83 if (!target) {
84 LOG4CXX_ERROR(logger, es);
85 return;
86 }
87
88 LOG4CXX_INFO(logger, "Target " << target->getName());
89
90 MRI.reset(target->createMCRegInfo(tripleName));
91 if (!MRI) {
92 LOG4CXX_ERROR(logger, "no register info for target " << tripleName);
93 return;
94 }
95
96 // Set up disassembler.
97 AsmInfo.reset(target->createMCAsmInfo(*MRI, tripleName));
98 if (!AsmInfo) {
99 LOG4CXX_ERROR(logger, "no assembly info for target " << tripleName);
100 return;
101 }
102
103 STI.reset(target->createMCSubtargetInfo(tripleName, "", ""));
104 if (!STI) {
105 LOG4CXX_ERROR(logger, "no subtarget info for target " << tripleName);
106 return;
107 }
108
109 MII.reset(target->createMCInstrInfo());
110 if (!MII) {
111 LOG4CXX_ERROR(logger, "no instruction info for target " << tripleName);
112 return;
113 }
114
115 MOFI.reset(new MCObjectFileInfo);
116 MCContext Ctx(AsmInfo.get(), MRI.get(), MOFI.get());
117
118 DisAsm.reset(target->createMCDisassembler(*STI, Ctx));
119 if (!DisAsm) {
120 LOG4CXX_ERROR(logger, "no disassembler for target " << tripleName);
121 return;
122 }
123 RelInfo.reset(
124 target->createMCRelocationInfo(tripleName, Ctx));
125 if (RelInfo) {
126 Symzer.reset(
127 MCObjectSymbolizer::createObjectSymbolizer(Ctx, std::move(RelInfo), o));
128 if (Symzer)
129 DisAsm->setSymbolizer(std::move(Symzer));
130 }
131 RelInfo.release();
132 Symzer.release();
133
134 MIA.reset(target->createMCInstrAnalysis(MII.get()));
135 if (!MIA) {
136 LOG4CXX_ERROR(logger, "no instruction analysis for target " << tripleName);
137 return;
138 }
139
140 int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
141 IP.reset(target->createMCInstPrinter(AsmPrinterVariant, *AsmInfo, *MII, *MRI, *STI));
142 if (!IP) {
143 LOG4CXX_ERROR(logger, "no instruction printer for target " << tripleName);
144 return;
145 }
146
147 IP->setPrintImmHex(llvm::HexStyle::C);
148 IP->setPrintImmHex(true);
149
150 std::unique_ptr<MCObjectDisassembler> OD(
151 new MCObjectDisassembler(*o, *DisAsm, *MIA));
152 Mod.reset(OD->buildModule(false));
153 }
154
155 template <typename ELFT>
156 void LLVMDisassembler<ELFT>::start() {
157 readSymbols();
158 readSections();
159 disassemble();
160 readDynamicSymbols();
161 }
162
163 template <typename ELFT>
164 LLVMDisassembler<ELFT>::~LLVMDisassembler() {
165 // std::for_each(functions.begin(), functions.end(),
166 // [](std::pair<uint64_t,LLVMFunction*> it) {
167 // delete it.second;
168 // });
169 // std::for_each(blocks.begin(), blocks.end(),
170 // [](std::pair<uint64_t, LLVMBasicBlock*> it) {
171 // delete it.second;
172 // });
173 }
174
175 template <typename ELFT>
176 Function* LLVMDisassembler<ELFT>::disassembleFunctionAt(uint64_t address, const std::string& name) {
177 SectionRef text_section = sections[".text"];
178 uint64_t base_address, size;
179 text_section.getAddress(base_address);
180 text_section.getSize(size);
181
182 if (address < base_address ||
183 address >= base_address + size) {
184 return NULL;
185 }
186
187 if (functions.find(address) != functions.end()) {
188 return functions[address];
189 }
190
191 LLVMFunction * function;
192 if (name == "") {
193 std::stringstream s;
194 s << "<Unnamed 0x" << std::hex << address << ">";
195 function = new LLVMFunction(s.str(), address);
196 } else {
197 function = new LLVMFunction(name, address);
198 }
199 functions.insert(std::make_pair(address, function));
200
201 disassembleFunction(function);
202
203 return function;
204 }
205
206 template <typename ELFT>
207 void LLVMDisassembler<ELFT>::disassembleFunction(LLVMFunction* function) {
208 std::stack<LLVMBasicBlock*> remaining_blocks;
209 SectionRef text_section = sections[".text"];
210 StringRef bytes;
211 text_section.getContents(bytes);
212 StringRefMemoryObject ref(bytes);
213
214 LOG4CXX_DEBUG(logger, "Handling function " << function->getName());
215
216 LLVMBasicBlock * block = new LLVMBasicBlock(function->getStartAddress(), this);
217 remaining_blocks.push(block);
218 blocks.insert(std::make_pair(block->getStartAddress(), block));
219 function->addBasicBlock(block);
220
221 while (remaining_blocks.size()) {
222 LLVMBasicBlock * current_block = remaining_blocks.top();
223 remaining_blocks.pop();
224
225 LOG4CXX_DEBUG(logger, "Handling Block starting at " << std::hex << current_block->getStartAddress());
226
227 uint64_t inst_size;
228 uint64_t base_address;
229 text_section.getAddress(base_address);
230 uint64_t current_address = current_block->getStartAddress() - base_address;
231 while(true) {
232 MCInst inst;
233 std::string buf;
234 llvm::raw_string_ostream s(buf);
235
236 if(llvm::MCDisassembler::Success ==
237 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
238 uint64_t jmptarget;
239
240 if (MIA->evaluateBranch(inst, current_address, inst_size, jmptarget)) {
241 jmptarget += base_address;
242 if (!MIA->isIndirectBranch(inst)) {
243 if (MIA->isCall(inst)) {
244 if (functions.find(jmptarget) == functions.end()) {
245 disassembleFunctionAt(jmptarget);
246 }
247 } else {
248 current_block->setNextBlock(0, jmptarget);
249 if (blocks.find(jmptarget) == blocks.end()) {
250 LLVMBasicBlock * block = new LLVMBasicBlock(jmptarget, this);
251 blocks.insert(std::make_pair(block->getStartAddress(), block));
252 function->addBasicBlock(block);
253 remaining_blocks.push(block);
254 } else {
255 LOG4CXX_DEBUG(logger, "Reusing Block starting at " << std::hex << current_block->getStartAddress());
256 function->addBasicBlock(blocks.find(jmptarget)->second);
257 }
258 if (MIA->isConditionalBranch(inst)) {
259 jmptarget = base_address + current_address + inst_size;
260 current_block->setNextBlock(1, jmptarget);
261 if (blocks.find(jmptarget) == blocks.end()) {
262 LLVMBasicBlock * block = new LLVMBasicBlock(jmptarget, this);
263 blocks.insert(std::make_pair(block->getStartAddress(), block));
264 function->addBasicBlock(block);
265 remaining_blocks.push(block);
266 } else {
267 LOG4CXX_DEBUG(logger, "Reusing Block starting at " << std::hex << current_block->getStartAddress());
268 function->addBasicBlock(blocks.find(jmptarget)->second);
269 }
270 }
271 }
272 }
273 }
274 } else {
275 inst_size = 0;
276 }
277
278
279 if (inst_size == 0 || MIA->isTerminator(inst) || MIA->isBranch(inst)) {
280 current_block->setEndAddress(current_address + base_address + inst_size);
281 LOG4CXX_DEBUG(logger, "Finished Block at " << std::hex <<
282 current_block->getEndAddress());
283 break;
284 }
285 current_address += inst_size;
286 }
287 }
288 splitBlocks(function);
289 LOG4CXX_DEBUG(logger, "Finished function " << function->getName());
290 manager->signal_new_function(function);
291 }
292
293 template <typename ELFT>
294 void LLVMDisassembler<ELFT>::disassemble() {
295 SectionRef text_section = sections[".text"];
296 std::vector<LLVMFunction*> remaining_functions;
297
298 // Assume all function symbols actually start a real function
299 for (auto x = symbols.begin(); x != symbols.end(); ++x) {
300 uint64_t result;
301 bool contains;
302 SymbolRef::Type symbol_type;
303
304
305 if (text_section.containsSymbol(x->second, contains) || !contains)
306 continue;
307
308 if (x->second.getType(symbol_type)
309 || SymbolRef::ST_Function != symbol_type)
310 continue;
311
312 if (!x->second.getAddress(result)) {
313 LLVMFunction * fun = new LLVMFunction(x->first, result);
314 remaining_functions.push_back(fun);
315 functions.insert(std::make_pair(result, fun));
316 LOG4CXX_DEBUG(logger, "Disasembling " << x->first);
317 }
318 }
319
320 for (LLVMFunction* function : remaining_functions) {
321 disassembleFunction(function);
322 }
323
324 if (binary->isELF()) {
325 const ELFO * elffile = o->getELFFile();
326 const typename ELFO::Elf_Ehdr * header = elffile->getHeader();
327
328 _entryAddress = header->e_entry;
329 LOG4CXX_DEBUG(logger, "Adding entryAddress at: " << std::hex << _entryAddress);
330 std::stringstream s;
331 s << "<_start 0x" << std::hex << _entryAddress << ">";
332
333 disassembleFunctionAt(_entryAddress, s.str());
334 }
335
336 if (functions.empty()) {
337 uint64_t text_entry;
338 text_section.getAddress(text_entry);
339 LOG4CXX_INFO(logger, "No Symbols found, starting at the beginning of the text segment");
340 disassembleFunctionAt(text_entry);
341 }
342 }
343
344 template <typename ELFT>
345 void LLVMDisassembler<ELFT>::splitBlocks(LLVMFunction* function) {
346 SectionRef text_section = sections[".text"];
347 StringRef bytes;
348 text_section.getContents(bytes);
349 StringRefMemoryObject ref(bytes);
350
351 // Split blocks where jumps are going inside the block
352 for (auto it = function->blocks().begin();
353 it != function->blocks().end();
354 ++it) {
355 BasicBlock * current_block = it->second;
356 uint64_t inst_size;
357 uint64_t base_address;
358 text_section.getAddress(base_address);
359 uint64_t current_address = current_block->getStartAddress() - base_address;
360 while(current_block->getEndAddress() - base_address > current_address) {
361 MCInst inst;
362 std::string buf;
363 llvm::raw_string_ostream s(buf);
364
365 if(llvm::MCDisassembler::Success ==
366 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
367 // See if some other block starts here
368 auto other = blocks.find(current_address + inst_size + base_address);
369
370 // Special case, other block starts here but we are at the end anyway
371 if (other != blocks.end()) {
372 uint64_t endaddress = current_address + inst_size + base_address;
373 if (endaddress != current_block->getEndAddress()) {
374 LOG4CXX_DEBUG(logger, "Shortening block starting at "
375 << std::hex
376 << current_block->getStartAddress()
377 << " now ending at "
378 << other->first);
379 function->addBasicBlock(other->second);
380 current_block->setEndAddress(endaddress);
381 current_block->setNextBlock(0, other->first);
382 current_block->setNextBlock(1, 0);
383 }
384 }
385 } else {
386 inst_size = 1;
387 }
388 current_address += inst_size;
389 }
390 }
391 }
392
393 template <typename ELFT>
394 void LLVMDisassembler<ELFT>::readDynamicSymbols() {
395 const ELFO * elffile = o->getELFFile();
396 for (typename ELFO::Elf_Sym_Iter
397 it = elffile->begin_dynamic_symbols(),
398 end = elffile->end_dynamic_symbols();
399 it != end;
400 ++it) {
401 if (it->getType() == 2) { // Function
402 bool is_default;
403 // TODO: Error handling
404 std::string symbolname = *(elffile->getSymbolName(it));
405 std::string symbolversion = *(elffile->getSymbolVersion(nullptr, &*it, is_default));
406 manager->signal_new_dyn_symbol(symbolname + (is_default? "@@" : "@") + symbolversion);
407 LOG4CXX_DEBUG(logger, "Adding dynamic Symbol " << symbolname << (is_default? "@@" : "@") << symbolversion);
408 }
409 }
410 }
411
412 template <typename ELFT>
413 void LLVMDisassembler<ELFT>::readSymbols() {
414 error_code ec;
415 symbol_iterator si(o->symbol_begin()), se(o->symbol_end());
416 for (; si != se; ++si) {
417 StringRef name;
418 if ((ec = si->getName(name))) {
419 LOG4CXX_ERROR(logger, ec.message());
420 break;
421 }
422 LOG4CXX_DEBUG(logger, "Added symbol " << name.str());
423 symbols.insert(make_pair(name.str(), *si));
424 }
425 }
426
427 template <typename ELFT>
428 void LLVMDisassembler<ELFT>::readSections() {
429 error_code ec;
430 section_iterator i(o->section_begin()), e(o->section_end());
431 for (; i != e; ++i) {
432 StringRef name;
433 if ((ec = i->getName(name))) {
434 LOG4CXX_ERROR(logger, ec.message());
435 break;
436 }
437 LOG4CXX_DEBUG(logger, "Added section " << name.str());
438 sections.insert(make_pair(name.str(), *i));
439 }
440
441 }
442
443 template <typename ELFT>
444 void LLVMDisassembler<ELFT>::forEachFunction(std::function<void (uint64_t, Function*)> callback) {
445 std::for_each(functions.begin(), functions.end(),
446 [&](std::pair<uint64_t, LLVMFunction*> x) {
447 callback(x.first, x.second);
448 });
449 }
450
451 template <typename ELFT>
452 void LLVMDisassembler<ELFT>::printEachInstruction(uint64_t start, uint64_t end,
453 std::function<void (uint8_t*, size_t,
454 const std::string&,
455 const std::string&)> fun) {
456 SectionRef text_section = sections[".text"];
457 uint64_t base_address;
458 text_section.getAddress(base_address);
459 uint64_t current_address = start - base_address;
460
461 StringRef bytes;
462 text_section.getContents(bytes);
463 StringRefMemoryObject ref(bytes);
464
465 while (current_address < end - base_address) {
466 uint64_t inst_size;
467 MCInst inst;
468 std::string buf;
469 llvm::raw_string_ostream s(buf);
470
471 if(llvm::MCDisassembler::Success ==
472 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
473
474 uint8_t bytes[inst_size+2];
475 ref.readBytes(current_address, inst_size, bytes);
476
477 uint64_t jmptarget;
478 std::string ref("");
479 IP->printInst(&inst, s, "");
480 if (MIA->evaluateBranch(inst, current_address, inst_size, jmptarget)) {
481 std::stringstream stream;
482 if (MIA->isCall(inst))
483 stream << "function:";
484 else
485 stream << "block:";
486
487 stream << std::hex << (base_address + jmptarget);
488 ref = stream.str();
489 }
490
491
492 fun(bytes, inst_size, s.str(), ref);
493 } else {
494 LOG4CXX_WARN(logger, "Invalid byte at" << std::hex << current_address + base_address);
495 fun(NULL, 0, "Invalid Byte", "");
496 inst_size = 1;
497 }
498
499 current_address += inst_size;
500 }
501 }