plaidCTF 2014 - tiffany (re300)
tiffany
Reversing (300 pts)
-------------------
We want to get access to a server used by The Plague. Maybe if you
can find out what key is accepted by this binary you can find out
where or when The Plague is...
Yay, a Linux x86_64 executable! Let’s run it and see what happens, because what could possibly go wrong when running a random binary off the internet?
$ ./tiffany
This may take a while...
.......
Please enter a string: TEST
....
Sorry, wrong.
Well, that took 3 seconds to initialize and 5 seconds per input string character. Sure seems to be doing a lot of stuff. Let’s load it into IDA to get a general idea.
.text:004017C9 loc_4017C9:
.text:004017C9 mov edi, 1 ; seconds
.text:004017CE call _sleep
.text:004017D3 mov [rsp+148h+last_child_pid], r12d
.text:004017DB mov ebx, cs:children
.text:004017E1 mov edi, 50000 ; useconds
.text:004017E6 call _usleep
.text:004017EB mov ecx, 0
.text:004017F0 mov edx, 0
.text:004017F5 mov esi, ebx
.text:004017F7 mov edi, PTRACE_ATTACH ; request
.text:004017FC mov eax, 0
.text:00401801 call _ptrace
.text:00401806 test rax, rax
.text:00401809 jns short loc_40182B
.text:0040180B mov edi, offset s ; "ptrace error!"
.text:00401810 call _perror
.text:00401815 mov esi, 9 ; sig
.text:0040181A mov edi, ebx ; pid
.text:0040181C call _kill
.text:00401821 mov edi, 1 ; status
.text:00401826 call _exit
.text:0040182B ; -------------------------------------------------------
.text:0040182B
.text:0040182B loc_40182B:
.text:0040182B mov edi, 0 ; stat_loc
.text:00401830 mov eax, 0
.text:00401835 call _wait
.text:0040183A mov ecx, 1
.text:0040183F mov edx, offset ready
.text:00401844 mov esi, ebx
.text:00401846 mov edi, PTRACE_POKEDATA ; request
.text:0040184B mov eax, 0
.text:00401850 call _ptrace
.text:00401855 test rax, rax
.text:00401858 jns short loc_40187A
.text:0040185A mov edi, offset s ; "ptrace error!"
.text:0040185F call _perror
.text:00401864 mov esi, 9 ; sig
.text:00401869 mov edi, ebx ; pid
.text:0040186B call _kill
.text:00401870 mov edi, 1 ; status
.text:00401875 call _exit
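In C, that fragment boils down to roughly the following (a hand decompilation of just this excerpt, not a standalone program; children and ready are the symbol names IDA shows, and the error handling is exactly what the binary does):
sleep(1);
usleep(50000);
pid_t child = children;                      /* pid of the first child */
if (ptrace(PTRACE_ATTACH, child, 0, 0) < 0) {
    perror("ptrace error!");
    kill(child, SIGKILL);
    exit(1);
}
wait(0);
/* poke a 1 into the child's `ready` flag */
if (ptrace(PTRACE_POKEDATA, child, &ready, (void *)1) < 0) {
    perror("ptrace error!");
    kill(child, SIGKILL);
    exit(1);
}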
Aha, ptrace. Looks like it forks off some children and then calls ptrace on them. Unfortunately, our favorite tool for inspecting system calls, strace, itself uses ptrace (and thus is incompatible with apps that ptrace themselves):
$ strace -f ./tiffany
[...]
[pid 26131] write(3, "ptrace error!: Operation not per"..., 39) = 39
ptrace error!: Operation not permitted
[pid 26131] close(3) = 0
[pid 26131] munmap(0x70ff5d524000, 4096) = 0
[pid 26131] brk(0x3051000) = 0x3051000
[pid 26131] kill(26132, SIGKILL) = 0
[pid 26131] exit_group(1) = ?
[pid 26132] +++ killed by SIGKILL +++
We can use a different approach, though: interposing a wrapper around ptrace using LD_PRELOAD.
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <signal.h>
#include <dlfcn.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <stdarg.h>
pid_t fork(void) {
    static pid_t (*fork_real)(void) = NULL;
    pid_t ret;
    if (!fork_real)
        fork_real = dlsym(RTLD_NEXT, "fork");
    ret = fork_real();
    fprintf(stderr, "[%5d] fork() = %d\n", getpid(), ret);
    return ret;
}
int kill(pid_t pid, int sig) {
    static int (*kill_real)(pid_t pid, int sig) = NULL;
    int ret;
    if (!kill_real)
        kill_real = dlsym(RTLD_NEXT, "kill");
    ret = kill_real(pid, sig);
    fprintf(stderr, "[%5d] kill(%d, %d) = %d\n",
            getpid(), pid, sig, ret);
    return ret;
}
static const char *reqs[] = {
    "TRACEME", "PEEKTEXT", "PEEKDATA", "PEEKUSER", "POKETEXT",
    "POKEDATA", "POKEUSER", "CONT", "KILL", "SINGLESTEP", "???", "???",
    "GETREGS", "SETREGS", "GETFPREGS", "SETFPREGS", "ATTACH",
    "DETACH", "GETFPXREGS", "SETFPXREGS"
};
long ptrace(enum __ptrace_request request, ...) {
    static long (*ptrace_real)(enum __ptrace_request request,
                               ...) = NULL;
    long ret;
    pid_t pid;
    void *addr, *data;
    // proto uses ... to be annoying...
    va_list ap;
    va_start(ap, request);
    pid = va_arg(ap, pid_t);
    addr = va_arg(ap, void*);
    data = va_arg(ap, void*);
    va_end(ap);
    const char *req = reqs[request];
    if (!ptrace_real)
        ptrace_real = dlsym(RTLD_NEXT, "ptrace");
    ret = ptrace_real(request, pid, addr, data);
    fprintf(stderr, "[%5d] ptrace(%s, %d, %p, %p) = %ld\n",
            getpid(), req, pid, addr, data, ret);
    return ret;
}
$ gcc -shared -ldl preload.c -o preload.so -fPIC
$ LD_PRELOAD=$PWD/preload.so ./tiffany 2>log.txt
Alas, this produced a 1GiB log file of ptrace calls. Well, that wasn’t going to be fun to stare at, so I went back to static analysis for now. This sure explained the slowness, though.
After staring at the disassembly for a while, a general pattern emerged: the code spawns 7 threads (processes) that communicate with each other using ptrace memory pokes and sleeps (using SIGUSR1 to wake up from the sleep). This is essentially a crude message-passing mechanism built on ptrace. The mailbox lives at fixed addresses in each process:
00618180 ready
00618184 child_id
00618188 command
0061818c data_buffer (up to 0x20000 32-bit words)
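From the disassembly and the logs, sending one message to a peer amounts to attaching, poking the mailbox fields, and waking the peer with SIGUSR1. Here is a sketch of that sequence (my reconstruction, not the binary's code: send_message is my name, and the exact poke order and the detach step are assumptions):
#include <stddef.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <signal.h>

/* Sketch of one message send using the mailbox layout above. */
static void send_message(pid_t peer, int target, int command,
                         const int *words, size_t nwords) {
    size_t i;
    ptrace(PTRACE_ATTACH, peer, 0, 0);
    wait(NULL);                                   /* wait for the peer to stop */
    for (i = 0; i < nwords; i++)                  /* one poke per 32-bit word */
        ptrace(PTRACE_POKEDATA, peer, (void *)(0x61818cUL + 4 * i),
               (void *)(long)words[i]);
    ptrace(PTRACE_POKEDATA, peer, (void *)0x618184UL, (void *)(long)target);
    ptrace(PTRACE_POKEDATA, peer, (void *)0x618188UL, (void *)(long)command);
    ptrace(PTRACE_POKEDATA, peer, (void *)0x618180UL, (void *)1L);   /* ready */
    ptrace(PTRACE_DETACH, peer, 0, 0);            /* assumed: let the peer run */
    kill(peer, SIGUSR1);                          /* cut its sleep() short */
}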
With that knowledge, we can write a more elaborate version of the preload shim that prints out messages, not raw ptrace calls.
/* snip identical includes */
/* number of data words to buffer and print */
#define DW 16
typedef struct _tinfo {
    pid_t pid;
    int forked;    /* child index assigned at fork time, -1 if unknown */
    int ready;
    int command;
    int target;
    int data[DW];
} tinfo;
static tinfo td[16] = { {0} };
/* Find (or create) the tracking slot for a given pid. */
static tinfo *get_tinfo(pid_t pid) {
    tinfo *p = td;
    while (p->pid != pid && p->pid) p++;
    if (!p->pid) {
        p->pid = pid;
        p->forked = -1;
    }
    return p;
}
static int fctr = 0;
pid_t fork(void) {
    static pid_t (*fork_real)(void) = NULL;
    pid_t ret;
    if (!fork_real)
        fork_real = dlsym(RTLD_NEXT, "fork");
    ret = fork_real();
    if (ret) {
        get_tinfo(ret)->forked = fctr;
        fprintf(stderr, "[%5d] PID %d => child %d\n",
                getpid(), ret, fctr);
        fctr++;
    }
    return ret;
}
int kill(pid_t pid, int sig) {
    static int (*kill_real)(pid_t pid, int sig) = NULL;
    int ret;
    if (!kill_real)
        kill_real = dlsym(RTLD_NEXT, "kill");
    if (sig != SIGUSR1) {
        fprintf(stderr, "[%5d] kill(%d, %d)...\n", getpid(), pid, sig);
    } else {
        /* SIGUSR1 means "message ready": dump the mailbox we tracked */
        tinfo *t = get_tinfo(pid);
        fprintf(stderr,
                "[%5d/%d] TO %d/%d: r=%d cmd=%d target=%d data=[",
                getpid(), fctr, t->pid, t->forked, t->ready,
                t->command, t->target);
        int i;
        for (i=0; i<DW; i++) {
            fprintf(stderr, "%d ", t->data[i]);
        }
        fprintf(stderr, "...]\n");
    }
    ret = kill_real(pid, sig);
    if (sig != SIGUSR1)
        fprintf(stderr, "[%5d] kill(%d, %d) = %d\n",
                getpid(), pid, sig, ret);
    return ret;
}
void exit(int status) {
    static void (*exit_real)(int status) = NULL;
    if (!exit_real)
        exit_real = dlsym(RTLD_NEXT, "exit");
    fprintf(stderr, "[%5d] exit(%d)\n", getpid(), status);
    exit_real(status);
}
long ptrace(enum __ptrace_request request, ...) {
    static long (*ptrace_real)(enum __ptrace_request request,
                               ...) = NULL;
    long ret;
    pid_t pid;
    void *addr, *data;
    // proto uses ... to be annoying...
    va_list ap;
    va_start(ap, request);
    pid = va_arg(ap, pid_t);
    addr = va_arg(ap, void*);
    data = va_arg(ap, void*);
    va_end(ap);
    if (!ptrace_real)
        ptrace_real = dlsym(RTLD_NEXT, "ptrace");
    tinfo *t = get_tinfo(pid);
    long int loc = (long int)addr;
    long int val = (long int)data;
    if (request == PTRACE_POKEDATA) {
        /* Record writes into the target's mailbox instead of logging every poke */
        if (loc == 0x618180) {
            t->ready = val;
        } else if (loc == 0x618184) {
            t->target = val;
        } else if (loc == 0x618188) {
            t->command = val;
        } else if (loc >= 0x61818c && loc < (0x61818c+DW*4)) {
            t->data[(loc-0x61818c)/4] = val;
        }
    }
    ret = ptrace_real(request, pid, addr, data);
    return ret;
}
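Built and used the same way as the first shim (the preload2.c filename is just what I called it):
$ gcc -shared -ldl preload2.c -o preload2.so -fPIC
$ LD_PRELOAD=$PWD/preload2.so ./tiffany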
This produces a much more palatable log that we can actually fit in this blog post:
This may take a while...
[26757] PID 26758 => child 0
[26757] PID 26759 => child 1
[26757] PID 26760 => child 2
[26757] PID 26761 => child 3
[26757] PID 26762 => child 4
[26757] PID 26763 => child 5
[26757] PID 26764 => child 6
[26757/7] TO 26758/0: r=1 cmd=0 target=0 data=[26764 0 0 0 0 0 0 0 ...]
[26757/7] TO 26758/0: r=1 cmd=1 target=0 data=[4 0 1 1 1 1 1 1 ...]
.[26757/7] TO 26759/1: r=1 cmd=1 target=1 data=[34 0 2 2 2 2 2 2 ...]
.[26757/7] TO 26760/2: r=1 cmd=1 target=2 data=[5 0 0 0 0 0 0 0 ...]
.[26757/7] TO 26761/3: r=1 cmd=1 target=3 data=[5 0 2 2 2 2 2 2 ...]
.[26757/7] TO 26762/4: r=1 cmd=1 target=4 data=[18 0 0 0 0 0 0 0 ...]
.[26757/7] TO 26763/5: r=1 cmd=1 target=5 data=[10 0 0 0 0 0 0 0 ...]
.[26757/7] TO 26764/6: r=1 cmd=1 target=6 data=[9 0 0 0 0 0 0 0 ...]
.
Please enter a string: A
.[26757/7] TO 26758/0: r=1 cmd=3 target=0 data=[65 0 1 1 1 1 1 1 ...]
[26758/0] TO 26764/-1: r=1 cmd=3 target=1 data=[65 0 1 1 1 1 1 1 ...]
[26764/6] TO 26763/5: r=1 cmd=3 target=1 data=[65 0 1 1 1 1 1 1 ...]
[26763/5] TO 26762/4: r=1 cmd=3 target=1 data=[65 0 1 1 1 1 1 1 ...]
[26762/4] TO 26761/3: r=1 cmd=3 target=1 data=[65 0 1 1 1 1 1 1 ...]
[26761/3] TO 26760/2: r=1 cmd=3 target=1 data=[65 0 1 1 1 1 1 1 ...]
[26760/2] TO 26759/1: r=1 cmd=3 target=1 data=[65 0 1 1 1 1 1 1 ...]
[26759/1] TO 26758/0: r=1 cmd=3 target=2 data=[65 0 1 1 1 1 1 1 ...]
[26758/0] TO 26764/-1: r=1 cmd=3 target=2 data=[65 0 1 1 1 1 1 1 ...]
[26764/6] TO 26763/5: r=1 cmd=3 target=2 data=[65 0 1 1 1 1 1 1 ...]
[26763/5] TO 26762/4: r=1 cmd=3 target=2 data=[65 0 1 1 1 1 1 1 ...]
[26762/4] TO 26761/3: r=1 cmd=3 target=2 data=[65 0 1 1 1 1 1 1 ...]
[26761/3] TO 26760/2: r=1 cmd=3 target=2 data=[65 0 1 1 1 1 1 1 ...]
[26760/2] TO 26759/1: r=1 cmd=3 target=3 data=[65 0 1 1 1 1 1 1 ...]
[26759/1] TO 26758/0: r=1 cmd=3 target=3 data=[65 0 1 1 1 1 1 1 ...]
[26758/0] TO 26764/-1: r=1 cmd=3 target=3 data=[65 0 1 1 1 1 1 1 ...]
[26764/6] TO 26763/5: r=1 cmd=3 target=3 data=[65 0 1 1 1 1 1 1 ...]
[26763/5] TO 26762/4: r=1 cmd=3 target=3 data=[65 0 1 1 1 1 1 1 ...]
[26762/4] TO 26761/3: r=1 cmd=3 target=3 data=[65 0 1 1 1 1 1 1 ...]
[26761/3] TO 26760/2: r=1 cmd=3 target=4 data=[65 0 1 1 1 1 1 1 ...]
[26760/2] TO 26759/1: r=1 cmd=3 target=4 data=[65 0 1 1 1 1 1 1 ...]
[26759/1] TO 26758/0: r=1 cmd=3 target=4 data=[65 0 1 1 1 1 1 1 ...]
[26758/0] TO 26764/-1: r=1 cmd=3 target=4 data=[65 0 1 1 1 1 1 1 ...]
[26764/6] TO 26763/5: r=1 cmd=3 target=4 data=[65 0 1 1 1 1 1 1 ...]
[26763/5] TO 26762/4: r=1 cmd=3 target=4 data=[65 0 1 1 1 1 1 1 ...]
[26762/4] TO 26761/3: r=1 cmd=3 target=5 data=[65 0 1 1 1 1 1 1 ...]
[26761/3] TO 26760/2: r=1 cmd=3 target=5 data=[65 0 1 1 1 1 1 1 ...]
[26760/2] TO 26759/1: r=1 cmd=3 target=5 data=[65 0 1 1 1 1 1 1 ...]
[26759/1] TO 26758/0: r=1 cmd=3 target=5 data=[65 0 1 1 1 1 1 1 ...]
[26758/0] TO 26764/-1: r=1 cmd=3 target=5 data=[65 0 1 1 1 1 1 1 ...]
[26764/6] TO 26763/5: r=1 cmd=3 target=5 data=[65 0 1 1 1 1 1 1 ...]
[26763/5] TO 26762/4: r=1 cmd=3 target=6 data=[65 0 1 1 1 1 1 1 ...]
[26762/4] TO 26761/3: r=1 cmd=3 target=6 data=[65 0 1 1 1 1 1 1 ...]
[26761/3] TO 26760/2: r=1 cmd=3 target=6 data=[65 0 1 1 1 1 1 1 ...]
[26760/2] TO 26759/1: r=1 cmd=3 target=6 data=[65 0 1 1 1 1 1 1 ...]
[26759/1] TO 26758/0: r=1 cmd=3 target=6 data=[65 0 1 1 1 1 1 1 ...]
[26758/0] TO 26764/-1: r=1 cmd=3 target=6 data=[65 0 1 1 1 1 1 1 ...]
[26764/6] TO 26757/-1: r=0 cmd=0 target=0 data=[0 0 0 0 0 0 0 0 ...]
[26757/7] TO 26758/0: r=1 cmd=2 target=0 data=[65 0 1 1 1 1 1 1 ...]
[26758/0] TO 26764/-1: r=1 cmd=4 target=1 data=[26758 0 1 1 1 1 1 1 ...]
[26764/6] TO 26763/5: r=1 cmd=4 target=1 data=[26758 0 1 1 1 1 1 1 ...]
[26763/5] TO 26762/4: r=1 cmd=4 target=1 data=[26758 0 1 1 1 1 1 1 ...]
[26762/4] TO 26761/3: r=1 cmd=4 target=1 data=[26758 0 1 1 1 1 1 1 ...]
[26761/3] TO 26760/2: r=1 cmd=4 target=1 data=[26758 0 1 1 1 1 1 1 ...]
[26760/2] TO 26759/1: r=1 cmd=4 target=1 data=[26758 0 1 1 1 1 1 1 ...]
[26759/1] TO 26758/0: r=1 cmd=0 target=0 data=[0 0 1 1 1 1 1 1 ...]
[26758/0] TO 26764/-1: r=1 cmd=4 target=2 data=[26758 0 1 1 1 1 1 1 ...]
[26764/6] TO 26763/5: r=1 cmd=4 target=2 data=[26758 0 1 1 1 1 1 1 ...]
[26763/5] TO 26762/4: r=1 cmd=4 target=2 data=[26758 0 1 1 1 1 1 1 ...]
[26762/4] TO 26761/3: r=1 cmd=4 target=2 data=[26758 0 1 1 1 1 1 1 ...]
[26761/3] TO 26760/2: r=1 cmd=4 target=2 data=[26758 0 1 1 1 1 1 1 ...]
[26760/2] TO 26758/0: r=1 cmd=0 target=0 data=[0 0 0 0 0 0 0 0 ...]
[26758/0] TO 26764/-1: r=1 cmd=4 target=3 data=[26758 0 1 1 1 1 1 1 ...]
[26764/6] TO 26763/5: r=1 cmd=4 target=3 data=[26758 0 1 1 1 1 1 1 ...]
[26763/5] TO 26762/4: r=1 cmd=4 target=3 data=[26758 0 1 1 1 1 1 1 ...]
[26762/4] TO 26761/3: r=1 cmd=4 target=3 data=[26758 0 1 1 1 1 1 1 ...]
[26761/3] TO 26758/0: r=1 cmd=0 target=0 data=[0 0 0 0 0 0 0 0 ...]
[26758/0] TO 26764/-1: r=1 cmd=4 target=4 data=[26758 0 1 1 1 1 1 1 ...]
[26764/6] TO 26763/5: r=1 cmd=4 target=4 data=[26758 0 1 1 1 1 1 1 ...]
[26763/5] TO 26762/4: r=1 cmd=4 target=4 data=[26758 0 1 1 1 1 1 1 ...]
[26762/4] TO 26758/0: r=1 cmd=0 target=0 data=[0 0 0 0 0 0 0 0 ...]
[26758/0] TO 26764/-1: r=1 cmd=4 target=5 data=[26758 0 1 1 1 1 1 1 ...]
[26764/6] TO 26763/5: r=1 cmd=4 target=5 data=[26758 0 1 1 1 1 1 1 ...]
[26763/5] TO 26758/0: r=1 cmd=0 target=0 data=[0 0 0 0 0 0 0 0 ...]
[26758/0] TO 26764/-1: r=1 cmd=4 target=6 data=[26758 0 1 1 1 1 1 1 ...]
[26764/6] TO 26758/0: r=1 cmd=0 target=0 data=[0 0 0 0 0 0 0 0 ...]
[26758/0] TO 26757/-1: r=1 cmd=0 target=0 data=[0 0 0 0 0 0 0 0 ...]
Sorry, wrong.
[26757] kill(26758, 9)...
[26757] kill(26758, 9) = 0
[26757] kill(26759, 9)...
[26757] kill(26759, 9) = 0
[26757] kill(26760, 9)...
[26757] kill(26760, 9) = 0
[26757] kill(26761, 9)...
[26757] kill(26761, 9) = 0
[26757] kill(26762, 9)...
[26757] kill(26762, 9) = 0
[26757] kill(26763, 9)...
[26757] kill(26763, 9) = 0
[26757] kill(26764, 9)...
[26757] kill(26764, 9) = 0
[26757] exit(0)
Note that child index 7 corresponds to the main thread in this log, and -1 means the index is unknown (because the process sending the message has not seen that pid go through the shim's fork()), but you can cross-reference the PID with the forks at the beginning to find out what the real child index is.
With the log and the disassembly, I started to understand the messaging scheme. The threads are arranged in a loop, with each thread pointing to the previous one (thread 0 wraps to thread 6). Threads can send commands to each other. Commands carry a numerical target (0-6); if a command arrives at the wrong thread, that thread forwards it to the previous one, so every command eventually reaches the correct thread - this is a ring bus. The chief reason for the slowness is that the buffer is 0x20000 words long and, even though most messages aren't remotely that large, the entire buffer is copied word by word, one ptrace call per word, for every message forwarded between threads.
The code roughly does the following:
Start threads
Join message bus ring (point thread6 to thread0 using CMD0)
Load blob of data into each thread (CMD1)
For each char of the passphrase:
Send CMD3 to thread0 with char
thread0 then forwards it to thread1 (via 6->5->4->3->2->1),
then thread1 to thread2... CMD3 eventually arrives at all
threads (terminates at thread6).
Call CMD2 on some thread (didn't bother to reverse this)
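Each child's side of this is essentially a receive-or-forward loop on the ring. A rough reconstruction (not the actual code: my_index, prev_pid and handle_command are my names, and send_message is the sketch from earlier):
#include <unistd.h>
#include <sys/types.h>

/* The globals mirror the mailbox layout above; the real binary forwards
 * the full 0x20000-word buffer one PTRACE_POKEDATA per word, which is
 * where all the time goes. */
extern volatile int ready, child_id, command;
extern int data_buffer[0x20000];
extern int my_index;
extern pid_t prev_pid;
void handle_command(int cmd, int *data);        /* CMD0..CMD4 handlers */

static void child_loop(void) {
    for (;;) {
        while (!ready)
            sleep(1);                           /* SIGUSR1 cuts the sleep short */
        ready = 0;
        if (child_id == my_index)
            handle_command(command, data_buffer);
        else
            send_message(prev_pid, child_id, command,   /* pass it along the ring */
                         data_buffer, 0x20000);
    }
}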
What are those blobs passed to CMD1? The data is in multiples of 0x404 bytes (0x101 words) and the first word is the count of 0x101-word blocks. The answer lies in the CMD3 handler:
.text:004011F5 cmd_3 proc near
.text:004011F5 movsxd rax, cs:message_data
.text:004011FC movsxd rdx, cs:state_index
.text:00401203 imul rdx, 101h
.text:0040120A add rdx, rax
.text:0040120D mov rax, cs:state_table
.text:00401214 mov eax, [rax+rdx*4+8]
.text:00401218 mov cs:state_index, eax
Or, in C terms:
state_index = state_table[state_index].next_state[message_data[0]]
Each thread has a state machine, which is an array of states (with a variable number of states per thread). A state is an array of 256 words containing the next state to visit for each possible input byte, plus a valid flag. The valid flag (always set for the last state) marks it as a valid state after checking the passphrase, although it is possible to exit that state back into an incorrect state (used to implement end-of-string checks).
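In C terms, each 0x101-word block of the blob can be read as follows (the field names are mine, but the layout is what the disassembly above indexes):
#include <stdint.h>

/* One 0x404-byte (0x101-word) block of a CMD1 blob. */
typedef struct {
    uint32_t valid;       /* 1 = acceptable state once the input ends */
    uint32_t next[256];   /* next state index for each possible input byte */
} state_t;

/* CMD3 then just advances the machine by one input byte:
 *   state_index = states[state_index].next[byte];           */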
I could’ve dumped the state machines from the ptrace shim, but I already had those structures identified in IDA, so I just lifted them from there. IDA’s dup() construct automatically identifies runs of identical words, and thus implicitly reveals which character is being checked by a state. Most state tables consist of a default next state X and a different next state Y for a particular character, so the contents of the state table are [X]*character_code + [Y] + [X]*(255-character_code). For example, t0, which has 4 states:
.rodata:00402188 t0_blob dd 4                         ; count
.rodata:0040218C         dd 0                          ; valid
.rodata:00402190         dd 7Bh dup(1), 2, 84h dup(1)  ; data
.rodata:00402590         dd 0                          ; valid
.rodata:00402594         dd 100h dup(1)                ; data
.rodata:00402994         dd 0                          ; valid
.rodata:00402998         dd 7Dh dup(2), 3, 82h dup(2)  ; data
.rodata:00402D98         dd 1                          ; valid
.rodata:00402D9C         dd 100h dup(1)                ; data
This implements the following state machine:
state 0:
'{' (0x7b) -> state 2
else -> state 1
state 1:
any -> state 1 (dead end)
state 2:
'}' (0x7d) -> state 3
else -> state 2
state 3 (final state, valid=1):
any -> state 1
Or, in other words: a ‘{’, followed by any number of non-‘}’ characters, followed by ‘}’, followed by no more characters (end of string).
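Recovering the character a state checks for can also be automated: scan its 256 next-state entries for the one that differs from the default. A small helper along those lines (my own sketch, assuming the state_t layout above and that the exceptional byte is never 0):
#include <stdio.h>
#include <stdint.h>

/* Print a state's transitions as a default target plus exceptions. */
static void print_state(const uint32_t next[256]) {
    uint32_t dflt = next[0];              /* byte 0 always takes the default */
    printf("default -> %u", dflt);
    for (int c = 1; c < 256; c++)
        if (next[c] != dflt)
            printf(", '%c' (0x%02x) -> %u",
                   (c >= 0x20 && c < 0x7f) ? c : '.', c, next[c]);
    printf("\n");
}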
The state machines basically each implement a regular expression, and all threads’ state machines must arrive at the correct final state for the password to be valid. These are the equivalent regexps implemented for each thread:
0 ^\{[^}]*\}$
1 ^.{32}$
2 ^[^_]*_[^_]*_[^_]*_
3 ^[^_]my
4 ^[^_]*_synchronization
5 ^[^_]*_[^_]*_skills
6 ^[^_]*_[^_]*_[^_]*_suck
From this, we can put together the flag: {my_synchronization_skills_suck}.
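As a quick sanity check (my own snippet, not part of the challenge), the candidate satisfies all seven expressions; the braces are written as bracket expressions to keep POSIX ERE happy:
#include <regex.h>
#include <stdio.h>

/* Check the candidate flag against the seven per-thread regexps above. */
int main(void) {
    const char *flag = "{my_synchronization_skills_suck}";
    const char *res[] = {
        "^[{][^}]*[}]$",
        "^.{32}$",
        "^[^_]*_[^_]*_[^_]*_",
        "^[^_]my",
        "^[^_]*_synchronization",
        "^[^_]*_[^_]*_skills",
        "^[^_]*_[^_]*_[^_]*_suck",
    };
    for (int i = 0; i < 7; i++) {
        regex_t re;
        regcomp(&re, res[i], REG_EXTENDED | REG_NOSUB);
        printf("thread %d: %s\n", i,
               regexec(&re, flag, 0, NULL, 0) ? "no match" : "ok");
        regfree(&re);
    }
    return 0;
}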