Reorganize Function/BasicBlock creation
[frida/frida.git] / src / disassembler / llvm / LLVMDisassembler.cxx
1 #include "disassembler/llvm/LLVMDisassembler.hxx"
2 #include "core/InformationManager.hxx"
3 #include "core/Function.hxx"
4 #include "core/BasicBlock.hxx"
5
6 #include <stack>
7 #include <algorithm>
8 #include <cassert>
9
10 using namespace llvm;
11 using namespace llvm::object;
12 using std::error_code;
13
14 /*
15 *
16 */
17 Disassembler * createLLVMDisassembler(const std::string& filename, InformationManager* manager) {
18 if (filename == "")
19 return NULL;
20
21 std::unique_ptr<Binary> o;
22 o.reset(createBinary(filename).get());
23 Binary * op = o.release();
24
25 // ELFType<endian, maxalign, 64bit>
26 if (ELF32LEObjectFile * object = dyn_cast<ELF32LEObjectFile>(op)) {
27 return new LLVMDisassembler<ELFType<support::little, 2, false>>(filename, manager, object);
28 }
29 if (ELF64LEObjectFile * object = dyn_cast<ELF64LEObjectFile>(op)) {
30 return new LLVMDisassembler<ELFType<support::little, 2, true>>(filename, manager, object);
31 }
32 if (ELF32BEObjectFile * object = dyn_cast<ELF32BEObjectFile>(op)) {
33 return new LLVMDisassembler<ELFType<support::big, 2, false>>(filename, manager, object);
34 }
35 if (ELF64BEObjectFile * object = dyn_cast<ELF64BEObjectFile>(op)) {
36 return new LLVMDisassembler<ELFType<support::big, 2, true>>(filename, manager, object);
37 }
38
39 return NULL;
40 }
41
42 /*
43 * TODO: fallback code falls die Datei kein ELF/PE/COFF/MacO/.. binary
44 * ist sondern z.B. einfach nur Instruktionen oder ein Bootsektor oder
45 * foo
46 */
47 template <typename ELFT>
48 LLVMDisassembler<ELFT>::LLVMDisassembler(const std::string& filename,
49 InformationManager* manager,
50 ELFObjectFile<ELFT>* file)
51 : Disassembler(filename, manager)
52 , logger(log4cxx::Logger::getLogger("LLVMDisassembler"))
53 , triple("unknown-unknown-unknown")
54 , manager(manager)
55 {
56 LOG4CXX_DEBUG(logger, "Handling file" << filename);
57
58 if (!file) {
59 auto result = createBinary(filename);
60
61 error_code ec;
62 if ((ec = result.getError())) {
63 LOG4CXX_ERROR(logger, "Failed to load Binary" << ec.message());
64 binary = NULL;
65 return;
66 }
67
68 binary.reset(result.get());
69
70 o = dyn_cast<ELFObjectFile<ELFT>>(binary.get());
71 } else {
72 o = file;
73 binary.reset(file);
74 }
75
76 triple.setArch(Triple::ArchType(o->getArch()));
77 std::string tripleName(triple.getTriple());
78
79 LOG4CXX_INFO(logger, "Architecture " << tripleName);
80
81
82 std::string es;
83 target = TargetRegistry::lookupTarget("", triple, es);
84 if (!target) {
85 LOG4CXX_ERROR(logger, es);
86 return;
87 }
88
89 LOG4CXX_INFO(logger, "Target " << target->getName());
90
91 MRI.reset(target->createMCRegInfo(tripleName));
92 if (!MRI) {
93 LOG4CXX_ERROR(logger, "no register info for target " << tripleName);
94 return;
95 }
96
97 // Set up disassembler.
98 AsmInfo.reset(target->createMCAsmInfo(*MRI, tripleName));
99 if (!AsmInfo) {
100 LOG4CXX_ERROR(logger, "no assembly info for target " << tripleName);
101 return;
102 }
103
104 STI.reset(target->createMCSubtargetInfo(tripleName, "", ""));
105 if (!STI) {
106 LOG4CXX_ERROR(logger, "no subtarget info for target " << tripleName);
107 return;
108 }
109
110 MII.reset(target->createMCInstrInfo());
111 if (!MII) {
112 LOG4CXX_ERROR(logger, "no instruction info for target " << tripleName);
113 return;
114 }
115
116 MOFI.reset(new MCObjectFileInfo);
117 MCContext Ctx(AsmInfo.get(), MRI.get(), MOFI.get());
118
119 DisAsm.reset(target->createMCDisassembler(*STI, Ctx));
120 if (!DisAsm) {
121 LOG4CXX_ERROR(logger, "no disassembler for target " << tripleName);
122 return;
123 }
124 RelInfo.reset(
125 target->createMCRelocationInfo(tripleName, Ctx));
126 if (RelInfo) {
127 Symzer.reset(
128 MCObjectSymbolizer::createObjectSymbolizer(Ctx, std::move(RelInfo), o));
129 if (Symzer)
130 DisAsm->setSymbolizer(std::move(Symzer));
131 }
132 RelInfo.release();
133 Symzer.release();
134
135 MIA.reset(target->createMCInstrAnalysis(MII.get()));
136 if (!MIA) {
137 LOG4CXX_ERROR(logger, "no instruction analysis for target " << tripleName);
138 return;
139 }
140
141 int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
142 IP.reset(target->createMCInstPrinter(AsmPrinterVariant, *AsmInfo, *MII, *MRI, *STI));
143 if (!IP) {
144 LOG4CXX_ERROR(logger, "no instruction printer for target " << tripleName);
145 return;
146 }
147
148 IP->setPrintImmHex(llvm::HexStyle::C);
149 IP->setPrintImmHex(true);
150
151 std::unique_ptr<MCObjectDisassembler> OD(
152 new MCObjectDisassembler(*o, *DisAsm, *MIA));
153 Mod.reset(OD->buildModule(false));
154 }
155
156 template <typename ELFT>
157 void LLVMDisassembler<ELFT>::start() {
158 readSymbols();
159 readSections();
160 disassemble();
161 readDynamicSymbols();
162 }
163
164 template <typename ELFT>
165 LLVMDisassembler<ELFT>::~LLVMDisassembler() {}
166
167 template <typename ELFT>
168 Function* LLVMDisassembler<ELFT>::disassembleFunctionAt(uint64_t address, const std::string& name) {
169 Function * function;
170 SectionRef text_section = sections[".text"];
171 uint64_t base_address, size;
172 text_section.getAddress(base_address);
173 text_section.getSize(size);
174
175 if (address < base_address ||
176 address >= base_address + size) {
177 return NULL;
178 }
179
180 if (NULL == (function = manager->getFunction(address))) {
181
182 if (name == "") {
183 std::stringstream s;
184 s << "<Unnamed 0x" << std::hex << address << ">";
185 function = manager->newFunction(address);
186 function->setName(s.str());
187 } else {
188 function = manager->newFunction(address);
189 function->setName(name);
190 }
191 disassembleFunction(function);
192 manager->finishFunction(function);
193 }
194
195 return function;
196 }
197
198 template <typename ELFT>
199 void LLVMDisassembler<ELFT>::disassembleFunction(Function* function) {
200 std::stack<BasicBlock*> remaining_blocks;
201 /* TODO:
202 * Do all blocks get added properly? We should take care to remove
203 * the other ones at the end of the function!
204 */
205 std::map<uint64_t, BasicBlock*> new_blocks;
206 SectionRef text_section = sections[".text"];
207 StringRef bytes;
208 text_section.getContents(bytes);
209 StringRefMemoryObject ref(bytes);
210
211 LOG4CXX_DEBUG(logger, "Handling function " << function->getName());
212
213 BasicBlock * block = manager->newBasicBlock(function->getStartAddress());
214 remaining_blocks.push(block);
215 new_blocks.insert(std::make_pair(block->getStartAddress(), block));
216 function->addBasicBlock(block);
217
218 while (remaining_blocks.size()) {
219 BasicBlock * current_block = remaining_blocks.top();
220 remaining_blocks.pop();
221
222 LOG4CXX_DEBUG(logger, "Handling Block starting at " << std::hex
223 << current_block->getStartAddress());
224
225 uint64_t inst_size;
226 uint64_t base_address;
227 text_section.getAddress(base_address);
228 uint64_t current_address = current_block->getStartAddress() - base_address;
229 while(true) {
230 MCInst inst;
231 std::string buf;
232 llvm::raw_string_ostream s(buf);
233
234 if(llvm::MCDisassembler::Success ==
235 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
236 uint64_t jmptarget;
237
238 if (MIA->evaluateBranch(inst, current_address, inst_size, jmptarget)) {
239 jmptarget += base_address;
240 if (!MIA->isIndirectBranch(inst)) {
241 if (MIA->isCall(inst)) {
242 if (NULL == manager->getFunction(jmptarget))
243 disassembleFunctionAt(jmptarget);
244 } else {
245 current_block->setNextBlock(0, jmptarget);
246 if (new_blocks.find(jmptarget) == new_blocks.end()) {
247 BasicBlock * block = manager->newBasicBlock(jmptarget);
248 assert(block);
249 new_blocks.insert(std::make_pair(block->getStartAddress(), block));
250 function->addBasicBlock(block);
251 remaining_blocks.push(block);
252 } else {
253 LOG4CXX_DEBUG(logger, "Reusing Block starting at " << std::hex
254 << current_block->getStartAddress());
255 function->addBasicBlock(new_blocks.find(jmptarget)->second);
256 }
257 if (MIA->isConditionalBranch(inst)) {
258 jmptarget = base_address + current_address + inst_size;
259 current_block->setNextBlock(1, jmptarget);
260 if (new_blocks.find(jmptarget) == new_blocks.end()) {
261 BasicBlock * block = manager->newBasicBlock(jmptarget);
262 assert(block);
263 new_blocks.insert(std::make_pair(block->getStartAddress(), block));
264 function->addBasicBlock(block);
265 remaining_blocks.push(block);
266 } else {
267 LOG4CXX_DEBUG(logger, "Reusing Block starting at " << std::hex
268 << current_block->getStartAddress());
269 function->addBasicBlock(new_blocks.find(jmptarget)->second);
270 }
271 }
272 }
273 }
274 }
275 } else {
276 inst_size = 0;
277 }
278
279
280 if (inst_size == 0 || MIA->isTerminator(inst) || MIA->isBranch(inst)) {
281 current_block->setEndAddress(current_address + base_address + inst_size);
282 LOG4CXX_DEBUG(logger, "Finished Block at " << std::hex <<
283 current_block->getEndAddress());
284 break;
285 }
286 current_address += inst_size;
287 }
288 }
289 splitBlocks(function);
290 LOG4CXX_DEBUG(logger, "Finished function " << function->getName());
291 manager->signal_new_function(function);
292 }
293
294 template <typename ELFT>
295 void LLVMDisassembler<ELFT>::disassemble() {
296 SectionRef text_section = sections[".text"];
297 std::vector<Function*> remaining_functions;
298
299 // Assume all function symbols actually start a real function
300 for (auto x = symbols.begin(); x != symbols.end(); ++x) {
301 uint64_t result;
302 bool contains;
303 SymbolRef::Type symbol_type;
304
305
306 if (text_section.containsSymbol(x->second, contains) || !contains)
307 continue;
308
309 if (x->second.getType(symbol_type)
310 || SymbolRef::ST_Function != symbol_type)
311 continue;
312
313 if (!x->second.getAddress(result)) {
314 Function * fun = manager->newFunction(result);
315 fun->setName(x->first);
316 remaining_functions.push_back(fun);
317 LOG4CXX_DEBUG(logger, "Disasembling " << x->first);
318 }
319 }
320
321 for (Function* function : remaining_functions) {
322 disassembleFunction(function);
323 manager->finishFunction(function);
324 }
325
326 if (binary->isELF()) {
327 const ELFO * elffile = o->getELFFile();
328 const typename ELFO::Elf_Ehdr * header = elffile->getHeader();
329
330 _entryAddress = header->e_entry;
331 LOG4CXX_DEBUG(logger, "Adding entryAddress at: " << std::hex << _entryAddress);
332 std::stringstream s;
333 s << "<_start 0x" << std::hex << _entryAddress << ">";
334
335 disassembleFunctionAt(_entryAddress, s.str());
336 }
337
338 uint64_t text_entry;
339 text_section.getAddress(text_entry);
340 LOG4CXX_INFO(logger, "No Symbols found, starting at the beginning of the text segment");
341 disassembleFunctionAt(text_entry);
342 }
343
344 template <typename ELFT>
345 void LLVMDisassembler<ELFT>::splitBlocks(Function* function) {
346 SectionRef text_section = sections[".text"];
347 StringRef bytes;
348 text_section.getContents(bytes);
349 StringRefMemoryObject ref(bytes);
350
351 // Split blocks where jumps are going inside the block
352 for (auto it = function->blocks().begin();
353 it != function->blocks().end();
354 ++it) {
355 BasicBlock * current_block = it->second;
356 uint64_t inst_size;
357 uint64_t base_address;
358 text_section.getAddress(base_address);
359 uint64_t current_address = current_block->getStartAddress() - base_address;
360 while(current_block->getEndAddress() - base_address > current_address) {
361 MCInst inst;
362 std::string buf;
363 llvm::raw_string_ostream s(buf);
364
365 if(llvm::MCDisassembler::Success ==
366 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
367 // See if some other block starts here
368 BasicBlock* other = manager->getBasicBlock(current_address
369 + inst_size
370 + base_address);
371
372 // Special case, other block starts here but we are at the end anyway
373 if (other != NULL) {
374 uint64_t endaddress = current_address + inst_size + base_address;
375 if (endaddress != current_block->getEndAddress()) {
376 LOG4CXX_DEBUG(logger, "Shortening block starting at "
377 << std::hex
378 << current_block->getStartAddress()
379 << " now ending at "
380 << other->getStartAddress());
381 function->addBasicBlock(other);
382 current_block->setEndAddress(endaddress);
383 current_block->setNextBlock(0, other->getStartAddress());
384 current_block->setNextBlock(1, 0);
385 }
386 }
387 } else {
388 inst_size = 1;
389 }
390 current_address += inst_size;
391 }
392 }
393 }
394
395 template <typename ELFT>
396 void LLVMDisassembler<ELFT>::readDynamicSymbols() {
397 const ELFO * elffile = o->getELFFile();
398 for (typename ELFO::Elf_Sym_Iter
399 it = elffile->begin_dynamic_symbols(),
400 end = elffile->end_dynamic_symbols();
401 it != end;
402 ++it) {
403 if (it->getType() == 2) { // Function
404 bool is_default;
405 // TODO: Error handling
406 std::string symbolname = *(elffile->getSymbolName(it));
407 std::string symbolversion = *(elffile->getSymbolVersion(nullptr, &*it, is_default));
408 manager->signal_new_dyn_symbol(symbolname + (is_default? "@@" : "@") + symbolversion);
409 LOG4CXX_DEBUG(logger, "Adding dynamic Symbol " << symbolname << (is_default? "@@" : "@") << symbolversion);
410 }
411 }
412 }
413
414 template <typename ELFT>
415 void LLVMDisassembler<ELFT>::readSymbols() {
416 error_code ec;
417 symbol_iterator si(o->symbol_begin()), se(o->symbol_end());
418 for (; si != se; ++si) {
419 StringRef name;
420 if ((ec = si->getName(name))) {
421 LOG4CXX_ERROR(logger, ec.message());
422 break;
423 }
424 LOG4CXX_DEBUG(logger, "Added symbol " << name.str());
425 symbols.insert(make_pair(name.str(), *si));
426 }
427 }
428
429 template <typename ELFT>
430 void LLVMDisassembler<ELFT>::readSections() {
431 error_code ec;
432 section_iterator i(o->section_begin()), e(o->section_end());
433 for (; i != e; ++i) {
434 StringRef name;
435 if ((ec = i->getName(name))) {
436 LOG4CXX_ERROR(logger, ec.message());
437 break;
438 }
439 LOG4CXX_DEBUG(logger, "Added section " << name.str());
440 sections.insert(make_pair(name.str(), *i));
441 }
442
443 }
444
445 // template <typename ELFT>
446 // void LLVMDisassembler<ELFT>::forEachFunction(std::function<void (uint64_t, Function*)> callback) {
447 // // std::for_each(functions.begin(), functions.end(),
448 // // [&](std::pair<uint64_t, Function*> x) {
449 // // callback(x.first, x.second);
450 // // });
451 // }
452
453 template <typename ELFT>
454 void LLVMDisassembler<ELFT>::printEachInstruction(uint64_t start, uint64_t end,
455 std::function<void (uint8_t*, size_t,
456 const std::string&,
457 const std::string&)> fun) {
458 SectionRef text_section = sections[".text"];
459 uint64_t base_address;
460 text_section.getAddress(base_address);
461 uint64_t current_address = start - base_address;
462
463 StringRef bytes;
464 text_section.getContents(bytes);
465 StringRefMemoryObject ref(bytes);
466
467 while (current_address < end - base_address) {
468 uint64_t inst_size;
469 MCInst inst;
470 std::string buf;
471 llvm::raw_string_ostream s(buf);
472
473 if(llvm::MCDisassembler::Success ==
474 DisAsm->getInstruction(inst, inst_size, ref, current_address, nulls(), nulls())) {
475
476 uint8_t bytes[inst_size+2];
477 ref.readBytes(current_address, inst_size, bytes);
478
479 uint64_t jmptarget;
480 std::string ref("");
481 IP->printInst(&inst, s, "");
482 if (MIA->evaluateBranch(inst, current_address, inst_size, jmptarget)) {
483 std::stringstream stream;
484 if (MIA->isCall(inst))
485 stream << "function:";
486 else
487 stream << "block:";
488
489 stream << std::hex << (base_address + jmptarget);
490 ref = stream.str();
491 }
492
493
494 fun(bytes, inst_size, s.str(), ref);
495 } else {
496 LOG4CXX_WARN(logger, "Invalid byte at" << std::hex << current_address + base_address);
497 fun(NULL, 0, "Invalid Byte", "");
498 inst_size = 1;
499 }
500
501 current_address += inst_size;
502 }
503 }