Click here to Skip to main content
65,938 articles
CodeProject is changing. Read more.
Articles / Languages / Objective-C

x64 API Hooker + Disassembler

5.00/5 (3 votes)
8 Nov 2018CPOL3 min read 16.8K   256  
This article shows you how to implement trampolines on x64, introduces the idea of sync and async hooks, and dynamically added runtime filters and loggers.

Introduction

Let's see what we can do with this tool first. Consider the following program:

C++
#include <Windows.h>
#include <stdio.h>

#pragma comment(lib, "TestDll.lib")

__declspec(dllimport) void export_function1();

__declspec(dllimport) void export_function2();

// One "pause" step of the demo: a kernel32 import followed by a CRT import.
static void pause_step(void)
{
    Sleep(1);              // import from kernel32.dll
    getchar();             // import from msvcr120d.dll
}

// Demo target program: calls each TestDll export once, pausing after each
// call so that the hooked API calls can be observed one at a time.
int main(int argc, char* argv[])
{
    export_function1();    // import from TestDll.dll
    pause_step();

    export_function2();    // import from TestDll.dll
    pause_step();

    return 0;
}

Let's see how we can step through the calls in sync mode:

Image 1

And in async mode:

Image 2

Log file:

Image 3

So the workflow is:

  1. Specify the libraries used by the target process so that their exports will be patched. (We replace the first two bytes of each function with an infinite loop, so the calling thread blocks; this ensures that we won't miss any API call.)
  2. Launch target process (or perform some action that will trigger the execution of target process, it doesn't matter how it will start its execution)
  3. Now we can enter commands (to add more hooks, perform sync and async waits, etc.)

Note that we can filter API calls by loading filters. The job of the filter is to inspect the thread context, process memory, etc. and to return true (if we are interested in this call) or false (otherwise). Also in sync mode, we can stop on some call, attach debugger, do some stuff, detach debugger and continue like nothing happened. Another possibility is to add custom loggers that will dump register values and memory content to the file.

Using the Code

Now let's turn our attention to the code. I will go over the key points here; you can always download the sample to see the full code. Let's start with the structure definitions:

C++
// Register snapshot taken when a hooked function is entered.
// In sync mode it is filled from the blocked thread's CONTEXT (GetSyncRip);
// in async mode the injected Log stub builds it on the stack in exactly this
// field order and it is transferred over the socket as raw bytes, so the
// layout must not change.
struct FUNCTION_CONTEXT
{
    DWORD64 Rip;    // address of the hooked function
    DWORD64 Rsp;    // stack pointer at function entry
    DWORD64 Rcx;    // Rcx/Rdx/R8/R9: register values at function entry
    DWORD64 Rdx;
    DWORD64 R8;
    DWORD64 R9;
};

struct LIBRARY_ITEM;
struct FUNCTION_ITEM;
// Signature shared by filter and logger callbacks. For filters the return
// value decides whether the call is reported (non-zero) or skipped (zero).
typedef BOOL(*TAux)(FUNCTION_ITEM *func, HANDLE hProcess, FUNCTION_CONTEXT *context);

// Everything known about one hooked function.
struct FUNCTION_ITEM
{
    LIBRARY_ITEM *lib;              // library entry this function belongs to
    char *Name;                     // heap-allocated name; NULL marks an unused slot
    UCHAR Bytes[BYTES_SIZE];        // original first bytes of the function (restored by MODE_NO_HOOK)
    UCHAR *SyncTrampoline;          // trampoline address in the target process; NULL until InitSync
    UCHAR SyncHook[BYTES_SIZE];     // bytes written over the function start in sync mode
    UCHAR *AsyncTrampoline;         // trampoline address in the target process; NULL until InitAsync
    UCHAR AsyncHook[BYTES_SIZE];    // bytes written over the function start in async mode
    DWORD Offset;                   // file offset of the function in the library file (0 for runtime-added items)
    DWORD RVA;                      // RVA of the function; IsLibraryLoaded adds it to the module base
    DWORD64 Rip;                    // address of the function in the target process (0 while unknown)
    HMODULE LibFilter;              // presumably the module providing ProcFilter - confirm against the loader code
    TAux ProcFilter;                // optional filter; NULL means every call is accepted
    HMODULE LibLogger;              // presumably the module providing ProcLogger - confirm against the loader code
    TAux ProcLogger;                // optional logger; NULL means the default LogItem is used
    DWORD Mode;                     // current hook mode (MODE_NO_HOOK / MODE_SYNC_HOOK / MODE_ASYNC_HOOK)
};

// A module whose functions are hooked, together with its function table.
struct LIBRARY_ITEM
{
    char *LibName;          // module name used to look the module up in the target process
    char *FileName;         // path of the patched file on disk; NULL for items added at run time
    UCHAR *Base;            // module base address in the target process; NULL while not (yet) loaded
    DWORD Count;            // number of entries in item
    FUNCTION_ITEM *item;    // array of Count function entries (entries with Name == NULL are unused)
};

The FUNCTION_CONTEXT structure holds the register values at the moment the hook is hit. The LIBRARY_ITEM structure holds information about a module that contains functions to be hooked. The FUNCTION_ITEM structure holds information about a hooked function. As you can see, we use trampolines to implement API hooking; this technique is thread-safe and also allows us to hook "free" functions (not only "APIs" that are typically called through a pointer in the IAT).

Now let's see the code that patches a library file; it is executed before we enter the target process ID:

C++
LIBRARY_ITEM* AddLibrary(char *FileName, char *LibName)
{
    FILE *file;
    DWORD size;
    UCHAR *Image;
    IMAGE_NT_HEADERS *Headers;
    IMAGE_SECTION_HEADER *Sections;
    DWORD ExportsRVA;
    DWORD ExportsOffset;
    IMAGE_EXPORT_DIRECTORY *Exports;
    DWORD AddressOfFunctionsOffset;
    DWORD AddressOfNamesOffset;
    DWORD *AddressOfFunctions;
    DWORD *AddressOfNames;
    DWORD FunctionOffset;
    DWORD NameOffset;
    UCHAR *Function;
    char *Name;
    FUNCTION_ITEM *item;
    LIBRARY_ITEM *lib;
    DWORD Count;
    DWORD RVA;
    UCHAR SyncHook[SYNC_HOOK_SIZE];

    GenerateSyncHook(NULL, SyncHook);

    file = fopen(FileName, "rb");
    fseek(file, 0, SEEK_END);
    size = ftell(file);

    Image = (UCHAR*)malloc(size);
    fseek(file, 0, SEEK_SET);
    fread(Image, size, 1, file);
    fclose(file);

    Headers = (IMAGE_NT_HEADERS64*)(Image + ((IMAGE_DOS_HEADER*)Image)->e_lfanew);
    Sections = (IMAGE_SECTION_HEADER*)((UCHAR*)Headers + 
     (offsetof(IMAGE_NT_HEADERS, OptionalHeader) + Headers->FileHeader.SizeOfOptionalHeader));
    
    ExportsRVA = Headers->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress;
    ExportsOffset = TranslateRVAToOffset(Sections, ExportsRVA);
    Exports = (IMAGE_EXPORT_DIRECTORY*)(Image + ExportsOffset);

    if (Exports->NumberOfNames != Exports->NumberOfFunctions) DbgRaiseAssertionFailure();

    AddressOfFunctionsOffset = TranslateRVAToOffset(Sections, Exports->AddressOfFunctions);
    AddressOfNamesOffset = TranslateRVAToOffset(Sections, Exports->AddressOfNames);

    AddressOfFunctions = (DWORD*)(Image + AddressOfFunctionsOffset);
    AddressOfNames = (DWORD*)(Image + AddressOfNamesOffset);

    Count = Exports->NumberOfNames;
    item = (FUNCTION_ITEM*)malloc(sizeof(FUNCTION_ITEM) * Count);

    lib = (LIBRARY_ITEM*)malloc(sizeof(LIBRARY_ITEM));

    lib->Count = Count;
    lib->Base = NULL;
    lib->LibName = StringCopy(LibName);
    lib->item = item;
    lib->FileName = StringCopy(FileName);

    for (DWORD i = 0; i < Count; ++i)
    {
        FunctionOffset = TranslateRVAToOffset(Sections, AddressOfFunctions[i]);
        NameOffset = TranslateRVAToOffset(Sections, AddressOfNames[i]);

        Function = Image + FunctionOffset;
        Name = (char*)(Image + NameOffset);

        RVA = GetTargetRVAFromStub(Function, AddressOfFunctions[i]);

        if (RVA != AddressOfFunctions[i])
        {
            FunctionOffset = TranslateRVAToOffset(Sections, RVA);
            Function = Image + FunctionOffset;
        }

        if (memcmp(Function, SyncHook, SYNC_HOOK_SIZE))
        {
            item[i].Name = StringCopy(Name);
            item[i].RVA = RVA;
            item[i].Rip = 0;
            item[i].Offset = FunctionOffset;
            item[i].LibFilter = NULL;
            item[i].ProcFilter = NULL;
            item[i].LibLogger = NULL;
            item[i].ProcLogger = NULL;
            item[i].SyncTrampoline = NULL;
            item[i].AsyncTrampoline = NULL;
            item[i].Mode = MODE_SYNC_HOOK;
            item[i].lib = lib;

            memcpy(item[i].Bytes, Function, BYTES_SIZE);
            memcpy(Function, SyncHook, SYNC_HOOK_SIZE);
        }
        else
        {
            memset(&item[i], 0, sizeof(item[i]));
        }
    }

    file = fopen(FileName, "wb");
    fwrite(Image, size, 1, file);
    fclose(file);
    free(Image);

    return lib;
}

As you can see, we patch the library file and build a LIBRARY_ITEM structure that holds a FUNCTION_ITEM structure for each patched function.

Now let's consider the process of hooking.

Sync hook:

  1. the process blocks (first few bytes of function are replaced by infinite loop)
  2. we enumerate all threads of target process, find the corresponding FUNCTION_ITEM structure
  3. now we can debug the called function with debugger or just skip it (change thread instruction pointer, so it will point to sync trampoline that we previously generated)

Async hook:

  1. the first few bytes of the function are replaced by an instruction that fetches its own instruction pointer value, followed by a jump to a function exported by a special DLL injected into the target process
  2. this exported function collects the register values and passes this information to us (using sockets)
  3. we find corresponding FUNCTION_ITEM structure, log the function call and send the address of async trampoline back to target process (using sockets)
  4. exported function receives the address of async trampoline and jumps to this address

Let's see the function to generate trampoline:

C++
// Builds the trampoline for a hooked function: a relocated copy of the
// instructions that the hook overwrites, followed by a jump back into the
// original function behind them.
//
// hProcess   - target process (read to resolve rip-relative jmp/call slots).
// Rip        - address of the hooked function in the target process.
// Mode       - MODE_SYNC_HOOK or MODE_ASYNC_HOOK; selects how many bytes the
//              hook overwrites and therefore how many must be relocated.
// Bytes      - the original first BYTES_SIZE bytes of the function.
// TrampBytes - receives the trampoline code.
// TrampLen   - receives the number of bytes written to TrampBytes.
//
// Position-dependent instructions (relative jmp/call, rip-relative operands)
// are rewritten to absolute forms; everything else is copied verbatim.
void GenerateTrampoline(HANDLE hProcess, DWORD64 Rip, 
     DWORD Mode, UCHAR *Bytes, UCHAR *TrampBytes, DWORD *TrampLen)
{
    DWORD i = 0;        // read position in Bytes
    DWORD j = 0;        // write position in TrampBytes
    DWORD64 Address;
    Buffer code_buf;
    Instruction inst;
    INT32 Offset;
    UCHAR Rex;
    UCHAR Opcode;
    UCHAR Modrm;
    DWORD HookLen;
    DWORD AddrReg;
    DWORD Reg;

    if (Mode == MODE_SYNC_HOOK) HookLen = SYNC_HOOK_SIZE;
    else if (Mode == MODE_ASYNC_HOOK) HookLen = ASYNC_HOOK_SIZE;
    else DbgRaiseAssertionFailure();

    while (TRUE)
    {
        if (Bytes[i] == 0xC3)            // ret
        {
            TrampBytes[j] = Bytes[i];
            ++j;
            break;
        }
        
        // Enough whole instructions relocated: continue in the original code.
        if (i >= HookLen)
        {
            Address = Rip + i;
            GenerateDirectJump(Address, &TrampBytes[j]);
            j += DIRECT_JUMP_SIZE;
            break;
        }

        if (Bytes[i] == 0xEB)            // jmp rel8
        {
            ++i;
            // rel8 is a SIGNED displacement; without the cast a backward
            // jump (0x80..0xFF) would be resolved as a forward one.
            Offset = (signed char)Bytes[i];
            ++i;
            Address = Rip + i + Offset;
            GenerateDirectJump(Address, &TrampBytes[j]);
            j += DIRECT_JUMP_SIZE;
            break;
        }

        if (Bytes[i] == 0xE9)            // jmp rel32
        {
            ++i;
            Offset = *((INT32*)&Bytes[i]);
            i += sizeof(Offset);
            Address = Rip + i + Offset;
            GenerateDirectJump(Address, &TrampBytes[j]);
            j += DIRECT_JUMP_SIZE;
            break;
        }
        
        if (Bytes[i] == 0xE8)            // call rel32
        {
            ++i;
            Offset = *((INT32*)&Bytes[i]);
            i += sizeof(Offset);
            Address = Rip + i + Offset;
            GenerateDirectCall(Address, &TrampBytes[j]);
            j += DIRECT_CALL_SIZE;
            continue;
        }

        if ((Bytes[i] >= 0x40) && (Bytes[i] <= 0x4F))    // REX prefix
        {
            ++i;

            if ((Bytes[i] == 0xFF) && (Bytes[i + 1] == 0x25))     // jmp m64 (rip relative)
            {
                i += 2;
                Offset = *((INT32*)&Bytes[i]);
                i += sizeof(Offset);
                // Fetch the jump target from the pointer slot in the target process.
                ReadProcessMemoryEx(hProcess, Rip + i + Offset, (UCHAR*)&Address, sizeof(Address));
                GenerateDirectJump(Address, &TrampBytes[j]);
                j += DIRECT_JUMP_SIZE;
                break;
            }

            if ((Bytes[i] == 0xFF) && (Bytes[i + 1] == 0x15))     // call m64 (rip relative)
            {
                i += 2;
                Offset = *((INT32*)&Bytes[i]);
                i += sizeof(Offset);
                ReadProcessMemoryEx(hProcess, Rip + i + Offset, (UCHAR*)&Address, sizeof(Address));
                GenerateDirectCall(Address, &TrampBytes[j]);
                j += DIRECT_CALL_SIZE;
                continue;
            }
            
            --i;    // not one of the special cases: step back onto the prefix
        }

        // Decode the instruction at the CURRENT position. The buffer has to
        // start at Bytes[i] (its length already is BYTES_SIZE - i); starting
        // at Bytes[0] would decode the first instruction over and over again.
        c_MakeBuffer(&Bytes[i], BYTES_SIZE - i, (Encoding)0, &code_buf);

        inst_set_params(&inst, MODE_64, C_TRUE, &code_buf, NULL, 
                           SHOW_ADDRESS | SHOW_LOWER | SHOW_PSEUDO);

        if (!decode(&inst)) DbgRaiseAssertionFailure();

        if (inst.rip)
        {
            // Rip-relative operand: load the absolute address into a scratch
            // register and re-emit the instruction with [scratch] as operand.
            // NOTE(review): this path reads exactly [REX] opcode modrm disp32
            // and re-emits [REX] opcode modrm, so it assumes a one-byte
            // opcode with no trailing immediate - confirm this is sufficient
            // for the functions being hooked.
            if ((Bytes[i] >= 0x40) && (Bytes[i] <= 0x4F))
            {
                Rex = Bytes[i];
                ++i;
            }
            else Rex = 0;

            Opcode = Bytes[i];
            ++i;

            Modrm = Bytes[i];
            ++i;

            Offset = *((INT32*)&Bytes[i]);
            i += sizeof(Offset);

            Address = Rip + i + Offset;

            // Choose register number 0 or 1 as scratch, whichever differs
            // from the instruction's own reg field.
            Reg = (Modrm & 0x38) >> 3;
            AddrReg = (Reg) ? (0) : (1);

            GenerateRegisterOverride(AddrReg, Address, &TrampBytes[j]);
            j += REGISTER_OVERRIDE_SIZE;

            if (Rex)
            {
                TrampBytes[j] = Rex;
                ++j;
            }

            TrampBytes[j] = Opcode;
            ++j;

            TrampBytes[j] = AddrReg | (Reg << 3);    // mod = 00, rm = scratch register
            ++j;

            GenerateRegisterRestore(AddrReg, &TrampBytes[j]);
            j += REGISTER_RESTORE_SIZE;
        }
        else
        {
            // Position-independent instruction: copy it unchanged
            // (code_buf.i is the number of bytes the decoder consumed).
            memcpy(&TrampBytes[j], &Bytes[i], code_buf.i);

            i += code_buf.i;
            j += code_buf.i;
        }
    }

    *TrampLen = j;
}

I use my own disassembler, it is included in the sample. However, you can replace it with another one, I just needed a fast way to find out whether instruction uses rip relative addressing.

Now functions to generate hooks:

C++
// Builds the sync hook image: a rel8 jump with displacement -(SYNC_HOOK_SIZE)
// in the first SYNC_HOOK_SIZE bytes. When Bytes is given, the remainder of
// the BYTES_SIZE window is filled with the function's original bytes so the
// image can be written over the function start as a whole. With Bytes == NULL
// only the jump itself is produced.
void GenerateSyncHook(UCHAR *Bytes, UCHAR *HookBytes)
{
    GenerateRelative8Jump(-(SYNC_HOOK_SIZE), HookBytes);

    if (!Bytes) return;

    memcpy(&HookBytes[SYNC_HOOK_SIZE], &Bytes[SYNC_HOOK_SIZE], BYTES_SIZE - SYNC_HOOK_SIZE);
}

// Builds the async hook image for the function at Rip: a relative call with
// displacement 0 followed by an indirect jump through the pointer slot kept
// right behind the module's DOS header (Base + sizeof(IMAGE_DOS_HEADER)).
// When Bytes is given, the rest of the BYTES_SIZE window keeps the original
// function bytes.
void GenerateAsyncHook(UCHAR *Base, DWORD64 Rip, UCHAR *Bytes, UCHAR *HookBytes)
{
    DWORD64 Slot = (DWORD64)(Base + sizeof(IMAGE_DOS_HEADER));
    DWORD64 HookEnd = Rip + ASYNC_HOOK_SIZE;

    // Displacement of the slot relative to the end of the hook.
    INT32 Displacement = (INT32)(Slot - HookEnd);

    GenerateRelativeCall(0, HookBytes);
    GenerateIndirectJump(Displacement, &HookBytes[RELATIVE_CALL_SIZE]);

    if (!Bytes) return;

    memcpy(&HookBytes[ASYNC_HOOK_SIZE], &Bytes[ASYNC_HOOK_SIZE], BYTES_SIZE - ASYNC_HOOK_SIZE);
}

Init functions:

C++
// Prepares one function for sync hooking: generates its trampoline, copies
// it into freshly allocated executable memory in the target process and
// builds the sync hook image. Does nothing when the trampoline already exists.
void InitSync(FUNCTION_ITEM *func, HANDLE hProcess)
{
    UCHAR Code[MAX_TRAMPOLINE_SIZE];
    DWORD CodeLen;
    void *Remote;

    if (func->SyncTrampoline) return;

    GenerateTrampoline(hProcess, func->Rip, MODE_SYNC_HOOK, func->Bytes, Code, &CodeLen);

    Remote = VirtualAllocEx(hProcess, NULL, CodeLen, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
    func->SyncTrampoline = (UCHAR*)Remote;

    WriteProcessMemoryEx(hProcess, (DWORD64)func->SyncTrampoline, Code, CodeLen);
    GenerateSyncHook(func->Bytes, func->SyncHook);
}

// Runs the per-function InitSync for every used slot of the library whose
// current hook mode equals FilterMode.
void InitSync(LIBRARY_ITEM *lib, DWORD FilterMode, HANDLE hProcess)
{
    for (DWORD idx = 0; idx < lib->Count; ++idx)
    {
        FUNCTION_ITEM *entry = &lib->item[idx];

        if (!entry->Name) continue;                 // unused slot
        if (entry->Mode != FilterMode) continue;    // different hook mode

        InitSync(entry, hProcess);
    }
}

// Prepares one function for async hooking: generates its trampoline, copies
// it into freshly allocated executable memory in the target process and
// builds the async hook image. Does nothing when the trampoline already exists.
void InitAsync(FUNCTION_ITEM *func, HANDLE hProcess)
{
    UCHAR Code[MAX_TRAMPOLINE_SIZE];
    DWORD CodeLen;
    void *Remote;

    if (func->AsyncTrampoline) return;

    GenerateTrampoline(hProcess, func->Rip, MODE_ASYNC_HOOK, func->Bytes, Code, &CodeLen);

    Remote = VirtualAllocEx(hProcess, NULL, CodeLen, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
    func->AsyncTrampoline = (UCHAR*)Remote;

    WriteProcessMemoryEx(hProcess, (DWORD64)func->AsyncTrampoline, Code, CodeLen);
    GenerateAsyncHook(func->lib->Base, func->Rip, func->Bytes, func->AsyncHook);
}

// Stores pLog in the pointer slot right behind the library's DOS header in
// the target process (the slot the async hooks jump through), then runs the
// per-function InitAsync for every used slot whose hook mode equals FilterMode.
void InitAsync(LIBRARY_ITEM *lib, DWORD FilterMode, HANDLE hProcess, UCHAR *pLog)
{
    DWORD64 Slot = (DWORD64)(lib->Base + sizeof(IMAGE_DOS_HEADER));

    WriteProcessMemoryEx(hProcess, Slot, (UCHAR*)&pLog, sizeof(pLog));

    for (DWORD idx = 0; idx < lib->Count; ++idx)
    {
        FUNCTION_ITEM *entry = &lib->item[idx];

        if (!entry->Name) continue;                 // unused slot
        if (entry->Mode != FilterMode) continue;    // different hook mode

        InitAsync(entry, hProcess);
    }
}

Functions to add export, "free" function, and object:

C++
// Hooks a single export of a module that is already loaded in the target
// process. Returns a one-entry LIBRARY_ITEM with the sync hook installed, or
// NULL when the module or the export cannot be found.
LIBRARY_ITEM* AddExport(char *LibName, char *ProcName, HANDLE hProcess)
{
    HMODULE hRemote;
    UCHAR *Target;
    LIBRARY_ITEM *lib;
    FUNCTION_ITEM *fn;

    hRemote = GetModuleHandleRemote(hProcess, LibName);
    if (!hRemote) return NULL;

    Target = GetProcAddressRemote(hProcess, hRemote, ProcName);
    if (!Target) return NULL;

    lib = (LIBRARY_ITEM*)malloc(sizeof(LIBRARY_ITEM));
    lib->Base = (UCHAR*)hRemote;
    lib->Count = 1;
    lib->FileName = NULL;
    lib->LibName = StringCopy(LibName);
    lib->item = (FUNCTION_ITEM*)malloc(sizeof(FUNCTION_ITEM));

    fn = lib->item;
    fn->Name = StringCopy(ProcName);
    fn->lib = lib;
    fn->Offset = 0;
    fn->Rip = (DWORD64)Target;
    fn->RVA = 0;

    // Save the original bytes before the hook overwrites them.
    ReadProcessMemoryEx(hProcess, (DWORD64)Target, fn->Bytes, BYTES_SIZE);

    fn->LibFilter = NULL;
    fn->ProcFilter = NULL;
    fn->LibLogger = NULL;
    fn->ProcLogger = NULL;
    fn->SyncTrampoline = NULL;
    fn->AsyncTrampoline = NULL;

    InitSync(fn, hProcess);
    SetHook(fn, hProcess, MODE_SYNC_HOOK);

    return lib;
}

// Hooks a non-exported ("free") function given by its RVA inside a module
// that is already loaded in the target process. Returns a one-entry
// LIBRARY_ITEM with the sync hook installed, or NULL when the module is not
// loaded.
LIBRARY_ITEM* AddFunction(char *LibName, char *FunctionName, DWORD RVA, HANDLE hProcess)
{
    HMODULE hRemote;
    LIBRARY_ITEM *lib;
    FUNCTION_ITEM *fn;

    hRemote = GetModuleHandleRemote(hProcess, LibName);
    if (!hRemote) return NULL;

    lib = (LIBRARY_ITEM*)malloc(sizeof(LIBRARY_ITEM));
    lib->Base = (UCHAR*)hRemote;
    lib->Count = 1;
    lib->FileName = NULL;
    lib->LibName = StringCopy(LibName);
    lib->item = (FUNCTION_ITEM*)malloc(sizeof(FUNCTION_ITEM));

    fn = lib->item;
    fn->Rip = (DWORD64)hRemote + RVA;

    // Save the original bytes before the hook overwrites them.
    ReadProcessMemoryEx(hProcess, fn->Rip, fn->Bytes, BYTES_SIZE);

    fn->Name = StringCopy(FunctionName);
    fn->LibFilter = NULL;
    fn->ProcFilter = NULL;
    fn->LibLogger = NULL;
    fn->ProcLogger = NULL;
    fn->SyncTrampoline = NULL;
    fn->AsyncTrampoline = NULL;
    fn->Offset = 0;
    fn->RVA = 0;
    fn->lib = lib;

    InitSync(fn, hProcess);
    SetHook(fn, hProcess, MODE_SYNC_HOOK);

    return lib;
}

// Hooks the first Count entries of a virtual function table located at
// Module + RVA in the target process. Each entry becomes a FUNCTION_ITEM
// named "<ObjectName>::<index>" with the sync hook installed.
// Returns the new LIBRARY_ITEM, or NULL when the module is not loaded.
LIBRARY_ITEM* AddObject(char *LibName, char *ObjectName, DWORD RVA, DWORD Count, HANDLE hProcess)
{
    LIBRARY_ITEM *lib;
    HMODULE Module;
    UCHAR **Vtbl;
    char Number[16];    // large enough for any 32-bit value in decimal, sign and terminator included

    lib = NULL;
    Module = GetModuleHandleRemote(hProcess, LibName);

    if (Module)
    {
        lib = (LIBRARY_ITEM*)malloc(sizeof(LIBRARY_ITEM));
        lib->Base = (UCHAR*)Module;
        lib->LibName = StringCopy(LibName);
        lib->FileName = NULL;
        lib->Count = Count;
        lib->item = (FUNCTION_ITEM*)malloc(sizeof(FUNCTION_ITEM) * Count);
        Vtbl = (UCHAR**)malloc(sizeof(UCHAR*) * Count);

        // Copy the table of function pointers out of the target process.
        ReadProcessMemoryEx(hProcess, (DWORD64)((UCHAR*)Module + RVA), 
                           (UCHAR*)Vtbl, sizeof(UCHAR*) * Count);

        for (DWORD i = 0; i < Count; ++i)
        {
            lib->item[i].Rip = (DWORD64)Vtbl[i];
            ReadProcessMemoryEx(hProcess, (DWORD64)Vtbl[i], lib->item[i].Bytes, BYTES_SIZE);

            // Size the name from the actual index text. The previous fixed
            // "+ 5" only left room for two digits, so slot 100 and above
            // overflowed the heap block.
            itoa(i, Number, 10);
            lib->item[i].Name = (char*)malloc(strlen(ObjectName) + 2 + strlen(Number) + 1);
            strcpy(lib->item[i].Name, ObjectName);
            strcat(lib->item[i].Name, "::");
            strcat(lib->item[i].Name, Number);

            lib->item[i].LibFilter = NULL;
            lib->item[i].ProcFilter = NULL;
            lib->item[i].LibLogger = NULL;
            lib->item[i].ProcLogger = NULL;
            lib->item[i].SyncTrampoline = NULL;
            lib->item[i].AsyncTrampoline = NULL;
            lib->item[i].Offset = 0;
            lib->item[i].RVA = 0;    // was left uninitialised; AddExport/AddFunction set it to 0 as well
            lib->item[i].lib = lib;
            InitSync(&lib->item[i], hProcess);
            SetHook(&lib->item[i], hProcess, MODE_SYNC_HOOK);
        }

        free(Vtbl);
    }

    return lib;
}

Functions to set hook:

C++
// Switches one function to the given hook mode by writing the matching
// BYTES_SIZE image over its start in the target process: the original bytes
// (MODE_NO_HOOK), the sync hook or the async hook. An unknown mode raises an
// assertion and writes nothing. The mode is recorded in func->Mode either way.
void SetHook(FUNCTION_ITEM *func, HANDLE hProcess, DWORD Mode)
{
    UCHAR *Patch = NULL;

    if (Mode == MODE_NO_HOOK) Patch = func->Bytes;
    else if (Mode == MODE_SYNC_HOOK) Patch = func->SyncHook;
    else if (Mode == MODE_ASYNC_HOOK) Patch = func->AsyncHook;

    if (Patch)
    {
        WriteProcessMemoryEx(hProcess, func->Rip, Patch, BYTES_SIZE);
    }
    else DbgRaiseAssertionFailure();

    func->Mode = Mode;
}

// Switches every used slot of the library that is currently in FilterMode
// over to Mode.
void SetHook(LIBRARY_ITEM *lib, DWORD FilterMode, HANDLE hProcess, DWORD Mode)
{
    for (DWORD idx = 0; idx < lib->Count; ++idx)
    {
        FUNCTION_ITEM *entry = &lib->item[idx];

        if (!entry->Name) continue;                 // unused slot
        if (entry->Mode != FilterMode) continue;    // not in the mode being replaced

        SetHook(entry, hProcess, Mode);
    }
}

Function to check whether library is loaded by target process:

C++
// Reports whether the library is currently loaded in the target process.
// The first time it is seen loaded, its base address is recorded and the
// absolute address (Rip) of every used function slot is computed from its
// RVA. If it is seen again at a different base an assertion is raised. When
// the library is not loaded the recorded base is cleared.
BOOL IsLibraryLoaded(LIBRARY_ITEM *lib, HANDLE hProcess)
{
    HMODULE hRemote = GetModuleHandleRemote(hProcess, lib->LibName);

    if (!hRemote)
    {
        lib->Base = NULL;
        return FALSE;
    }

    if (lib->Base)
    {
        if (lib->Base != (UCHAR*)hRemote) DbgRaiseAssertionFailure();
        return TRUE;
    }

    lib->Base = (UCHAR*)hRemote;

    for (DWORD idx = 0; idx < lib->Count; ++idx)
    {
        FUNCTION_ITEM *entry = &lib->item[idx];

        if (entry->Name)
        {
            entry->Rip = (DWORD64)(lib->Base + entry->RVA);
        }
    }

    return TRUE;
}

Function to enumerate threads of target process and find the block:

C++
// Looks for a thread of the target process that is spinning on a sync hook,
// i.e. whose instruction pointer addresses memory that holds the sync hook
// bytes.
//
// hProcess - target process.
// context  - receives Rip, Rsp, Rcx, Rdx, R8 and R9 of the blocked thread.
// phThread - receives an open handle to that thread; the caller closes it.
//
// Returns TRUE when such a thread was found; otherwise FALSE, and the output
// parameters are left untouched.
BOOL GetSyncRip(HANDLE hProcess, FUNCTION_CONTEXT *context, HANDLE *phThread)
{
    DWORD TID;
    DWORD PID;
    HANDLE h;
    BOOL b;
    HANDLE hThread;
    THREADENTRY32 te;
    CONTEXT lcContext;
    UCHAR Buffer[SYNC_HOOK_SIZE];
    UCHAR SyncHook[SYNC_HOOK_SIZE];

    GenerateSyncHook(NULL, SyncHook);

    b = FALSE;

    TID = 0;
    PID = GetProcessId(hProcess);

    // The snapshot lists the threads of ALL processes; filter by owner below.
    h = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);

    if (h != INVALID_HANDLE_VALUE)
    {
        te.dwSize = sizeof(te);

        if (Thread32First(h, &te))
        {
            do
            {
                if ((te.dwSize >= (FIELD_OFFSET(THREADENTRY32, th32OwnerProcessID) + 
                                   sizeof(te.th32OwnerProcessID))) &&
                    (PID == te.th32OwnerProcessID))
                {
                    TID = te.th32ThreadID;

                    // OpenThread takes THREAD_* access rights; the PROCESS_*
                    // mask used before belongs to OpenProcess.
                    hThread = OpenThread(THREAD_ALL_ACCESS, FALSE, TID);

                    // A thread may exit between the snapshot and this point,
                    // so a failure here is not an error: skip the thread.
                    if (hThread)
                    {
                        lcContext.ContextFlags = CONTEXT_ALL;

                        if (GetThreadContext(hThread, &lcContext))
                        {
                            ReadProcessMemoryEx(hProcess, lcContext.Rip, Buffer, sizeof(Buffer));

                            if (!memcmp(Buffer, SyncHook, SYNC_HOOK_SIZE))
                            {
                                context->Rip = lcContext.Rip;
                                context->Rsp = lcContext.Rsp;
                                context->Rcx = lcContext.Rcx;
                                context->Rdx = lcContext.Rdx;
                                context->R8 = lcContext.R8;
                                context->R9 = lcContext.R9;

                                // Ownership of the handle passes to the caller.
                                *phThread = hThread;
                                b = TRUE;
                                break;
                            }
                        }

                        CloseHandle(hThread);
                    }
                }

                te.dwSize = sizeof(te);
            }
            while (Thread32Next(h, &te));
        }

        CloseHandle(h);
    }

    return b;
}

Code that implements sync and async waits runs in the separate thread.

Thread function for sync wait:

C++
// Thread routine for a sync wait. Polls the target process every 100 ms for
// a thread blocked on a sync hook. Calls rejected by the function's filter
// are skipped silently; the first accepted call is stored in args (context,
// hThread, func) and ends the routine. The routine also ends, without a
// result, once args->Exit is set.
DWORD WaitSyncThreadRoutine(SYNC_WAIT_THREAD_ARGS *args)
{
    BOOL b;
    HANDLE hThread;
    FUNCTION_ITEM *func;
    FUNCTION_CONTEXT context;

    std::list<LIBRARY_ITEM*>::iterator lib_i;

    while (TRUE)
    {
        b = FALSE;

        while (!args->Exit)
        {
            Sleep(100);

            b = GetSyncRip(args->hProcess, &context, &hThread);

            if (b) break;
        }

        // The poll loop ended without a hit, so exit was requested. Without
        // this check the outer loop would spin forever and never return.
        if (!b) break;

        func = NULL;

        // Libraries patched on disk first (only those currently loaded) ...
        for (lib_i = g_slib.begin(); lib_i != g_slib.end(); ++lib_i)
        {
            if (IsLibraryLoaded((*lib_i), args->hProcess))
            {
                func = LookupItemByRip((*lib_i), context.Rip);
                if (func) break;
            }
        }

        // ... then the items added at run time.
        if (!func)
        {
            for (lib_i = g_lib.begin(); lib_i != g_lib.end(); ++lib_i)
            {
                func = LookupItemByRip((*lib_i), context.Rip);
                if (func) break;
            }
        }

        if (func)
        {
            if ((!func->ProcFilter) || (func->ProcFilter(func, args->hProcess, &context)))
            {
                // Accepted: hand the call (and the thread handle) to the waiter.
                memcpy(&args->context, &context, sizeof(context));
                args->hThread = hThread;
                args->func = func;
                printf("Found\n");
                break;
            }
            else
            {
                // Filtered out: let the thread continue and keep waiting.
                InitSync(func, args->hProcess);
                SkipItem(func, hThread);
                CloseHandle(hThread);
            }
        }
        else
        {
            CloseHandle(hThread);
            DbgRaiseAssertionFailure();
        }
    }

    return 0;
}

Thread function for async wait:

C++
// Thread routine for an async wait. Switches all sync-hooked functions to
// async hooks, then serves requests from the injected DLL: for each received
// FUNCTION_CONTEXT it finds the function, runs filter and logger, and
// replies with the address of the function's async trampoline.
// When args->Exit is set the hooks are switched back to sync mode once; the
// routine keeps serving calls that are already in flight and returns after
// the first select timeout (5 s) with no pending request, or as soon as the
// socket fails or is closed by the peer.
DWORD WaitAsyncThreadRoutine(ASYNC_WAIT_THREAD_ARGS *args)
{
    fd_set set;
    BOOL restored;
    timeval timeout;
    FUNCTION_ITEM *func;
    FUNCTION_CONTEXT context;
    int Received;
    int Chunk;
    std::list<LIBRARY_ITEM*>::iterator lib_i;

    restored = FALSE;

    for (lib_i = g_lib.begin(); lib_i != g_lib.end(); ++lib_i)
    {
        SetHook((*lib_i), MODE_SYNC_HOOK, args->hProcess, MODE_ASYNC_HOOK);
    }

    for (lib_i = g_slib.begin(); lib_i != g_slib.end(); ++lib_i)
    {
        if (IsLibraryLoaded((*lib_i), args->hProcess))
        {
            SetHook((*lib_i), MODE_SYNC_HOOK, args->hProcess, MODE_ASYNC_HOOK);
        }
    }

    while (TRUE)
    {
        while (TRUE)
        {
            if (args->Exit)
            {
                if (!restored)
                {
                    for (lib_i = g_lib.begin(); lib_i != g_lib.end(); ++lib_i)
                    {
                        SetHook((*lib_i), MODE_ASYNC_HOOK, args->hProcess, MODE_SYNC_HOOK);
                    }

                    for (lib_i = g_slib.begin(); lib_i != g_slib.end(); ++lib_i)
                    {
                        if (IsLibraryLoaded((*lib_i), args->hProcess))
                        {
                            SetHook((*lib_i), MODE_ASYNC_HOOK, args->hProcess, MODE_SYNC_HOOK);
                        }
                    }

                    restored = TRUE;
                }
            }

            set.fd_count = 1;
            set.fd_array[0] = args->Socket;

            timeout.tv_sec = 5;
            timeout.tv_usec = 0;

            // Non-zero: data is pending, or select failed (the recv below then fails too).
            if (select(0, &set, NULL, NULL, &timeout)) break;

            if (args->Exit) goto end;
        }

        // Read one complete context. A stream socket may deliver it in
        // pieces, and recv returns 0 when the peer has closed the
        // connection; neither case may be mistaken for a full request.
        Received = 0;

        while (Received < (int)sizeof(context))
        {
            Chunk = recv(args->Socket, (char*)&context + Received, (int)sizeof(context) - Received, 0);

            if (Chunk <= 0) goto end;    // SOCKET_ERROR or connection closed

            Received += Chunk;
        }

        func = NULL;

        // Libraries patched on disk first (only those currently loaded) ...
        for (lib_i = g_slib.begin(); lib_i != g_slib.end(); ++lib_i)
        {
            if (IsLibraryLoaded((*lib_i), args->hProcess))
            {
                func = LookupItemByRip((*lib_i), context.Rip);
                if (func) break;
            }
        }

        // ... then the items added at run time.
        if (!func)
        {
            for (lib_i = g_lib.begin(); lib_i != g_lib.end(); ++lib_i)
            {
                func = LookupItemByRip((*lib_i), context.Rip);
                if (func) break;
            }
        }

        if (func)
        {
            if ((!func->ProcFilter) || (func->ProcFilter(func, args->hProcess, &context)))
            {
                if (!func->ProcLogger) LogItem(func, args->hProcess, &context);
                else func->ProcLogger(func, args->hProcess, &context);
            }

            // The hooked thread waits for this reply and then jumps to it.
            send(args->Socket, (char*)&func->AsyncTrampoline, sizeof(func->AsyncTrampoline), 0);
        }
        else DbgRaiseAssertionFailure();
    }

end:

    return 0;
}

Functions to inject code in the target process:

C++
// Loads a library into the target process by copying a small thread routine
// and its argument block into the process and running it as a remote thread.
// Returns the module handle reported by the remote routine, or NULL when the
// name does not fit into the argument block or the injection fails.
HMODULE LoadLibraryRemote(HANDLE hProcess, char *pLibName)
{
    HANDLE hThread;
    void *Proc, *Data;
    LOAD_LIBRARY_THREAD_ARGS thread_args;

    // LibName is a fixed-size buffer inside the argument block (the block is
    // copied to the target as raw bytes); reject names that would overflow it.
    if (!pLibName || (strlen(pLibName) >= sizeof(thread_args.LibName))) return NULL;

    // NOTE(review): passes this process's LoadLibraryA address to the target,
    // i.e. assumes kernel32 is mapped at the same address there - confirm.
    thread_args.pLoadLibraryA = LoadLibraryA;
    thread_args.Module = NULL;
    strcpy(thread_args.LibName, pLibName);

    Proc = VirtualAllocEx(hProcess, NULL, LOAD_LIBRARY_THREAD_ROUTINE_SIZE, 
                          MEM_COMMIT, PAGE_EXECUTE_READWRITE);

    Data = VirtualAllocEx(hProcess, NULL, sizeof(thread_args), MEM_COMMIT, PAGE_READWRITE);

    if (Proc && Data)
    {
        // disable incremental linking to make it work
        WriteProcessMemoryEx(hProcess, (DWORD64)Proc, 
                 (UCHAR*)LoadLibraryThreadRoutine, LOAD_LIBRARY_THREAD_ROUTINE_SIZE);

        WriteProcessMemoryEx(hProcess, (DWORD64)Data, (UCHAR*)&thread_args, sizeof(thread_args));

        hThread = CreateRemoteThread(hProcess, NULL, 0, (LPTHREAD_START_ROUTINE)Proc, Data, 0, NULL);

        if (hThread)
        {
            WaitForSingleObject(hThread, INFINITE);

            CloseHandle(hThread);

            // Fetch the result the remote routine stored in the argument block.
            ReadProcessMemoryEx(hProcess, (DWORD64)Data, (UCHAR*)&thread_args, sizeof(thread_args));
        }
    }

    if (Proc) VirtualFreeEx(hProcess, Proc, 0, MEM_RELEASE);

    if (Data) VirtualFreeEx(hProcess, Data, 0, MEM_RELEASE);

    return thread_args.Module;
}

// Queries the target process for the handle (base address) of a loaded
// module by running a small injected thread routine there.
// Returns the handle reported by the remote routine, or NULL when the module
// is not loaded, the name does not fit into the argument block, or the
// injection fails.
HMODULE GetModuleHandleRemote(HANDLE hProcess, char *pLibName)
{
    HANDLE hThread;
    void *Proc, *Data;
    GET_MODULE_HANDLE_THREAD_ARGS thread_args;

    // LibName is a fixed-size buffer inside the argument block (the block is
    // copied to the target as raw bytes); reject names that would overflow it.
    if (!pLibName || (strlen(pLibName) >= sizeof(thread_args.LibName))) return NULL;

    // NOTE(review): passes this process's GetModuleHandleA address to the
    // target, i.e. assumes kernel32 is mapped at the same address there - confirm.
    thread_args.pGetModuleHandleA = GetModuleHandleA;
    thread_args.Module = NULL;
    strcpy(thread_args.LibName, pLibName);

    Proc = VirtualAllocEx(hProcess, NULL, GET_MODULE_HANDLE_THREAD_ROUTINE_SIZE, 
                          MEM_COMMIT, PAGE_EXECUTE_READWRITE);

    Data = VirtualAllocEx(hProcess, NULL, sizeof(thread_args), MEM_COMMIT, PAGE_READWRITE);

    if (Proc && Data)
    {
        // disable incremental linking to make it work
        WriteProcessMemoryEx(hProcess, (DWORD64)Proc, 
        (UCHAR*)GetModuleHandleThreadRoutine, GET_MODULE_HANDLE_THREAD_ROUTINE_SIZE);

        WriteProcessMemoryEx(hProcess, (DWORD64)Data, (UCHAR*)&thread_args, sizeof(thread_args));

        hThread = CreateRemoteThread(hProcess, NULL, 0, (LPTHREAD_START_ROUTINE)Proc, Data, 0, NULL);

        if (hThread)
        {
            WaitForSingleObject(hThread, INFINITE);

            CloseHandle(hThread);

            // Fetch the result the remote routine stored in the argument block.
            ReadProcessMemoryEx(hProcess, (DWORD64)Data, (UCHAR*)&thread_args, sizeof(thread_args));
        }
    }

    if (Proc) VirtualFreeEx(hProcess, Proc, 0, MEM_RELEASE);

    if (Data) VirtualFreeEx(hProcess, Data, 0, MEM_RELEASE);

    return thread_args.Module;
}

// Resolves an export of a module inside the target process by running a
// small injected thread routine there, then follows the export through any
// stub (GetTargetAddressFromStub) to the address that should be hooked.
// Returns that address, or NULL when the export cannot be resolved, the name
// does not fit into the argument block, or the injection fails.
UCHAR* GetProcAddressRemote(HANDLE hProcess, HMODULE hModule, char *pProcName)
{
    DWORD64 Rip;
    UCHAR Bytes[BYTES_SIZE];
    void *Proc, *Data;
    HANDLE hThread;
    GET_PROC_ADDRESS_THREAD_ARGS thread_args;

    // ProcName is a fixed-size buffer inside the argument block (the block is
    // copied to the target as raw bytes); reject names that would overflow it.
    if (!pProcName || (strlen(pProcName) >= sizeof(thread_args.ProcName))) return NULL;

    // NOTE(review): passes this process's GetProcAddress address to the
    // target, i.e. assumes kernel32 is mapped at the same address there - confirm.
    thread_args.pGetProcAddress = GetProcAddress;
    thread_args.Module = hModule;
    thread_args.Proc = NULL;
    strcpy(thread_args.ProcName, pProcName);

    Proc = VirtualAllocEx(hProcess, NULL, 
           GET_PROCESS_ADDRESS_THREAD_ROUTINE_SIZE, MEM_COMMIT, PAGE_EXECUTE_READWRITE);

    Data = VirtualAllocEx(hProcess, NULL, sizeof(thread_args), MEM_COMMIT, PAGE_READWRITE);

    if (Proc && Data)
    {
        // disable incremental linking to make it work
        WriteProcessMemoryEx(hProcess, (DWORD64)Proc, 
               (UCHAR*)GetProcAddressThreadRoutine, GET_PROCESS_ADDRESS_THREAD_ROUTINE_SIZE);

        WriteProcessMemoryEx(hProcess, (DWORD64)Data, (UCHAR*)&thread_args, sizeof(thread_args));

        hThread = CreateRemoteThread(hProcess, NULL, 0, (LPTHREAD_START_ROUTINE)Proc, Data, 0, NULL);

        if (hThread)
        {
            WaitForSingleObject(hThread, INFINITE);

            CloseHandle(hThread);

            // Fetch the result the remote routine stored in the argument block.
            ReadProcessMemoryEx(hProcess, (DWORD64)Data, (UCHAR*)&thread_args, sizeof(thread_args));
        }
    }

    if (Proc) VirtualFreeEx(hProcess, Proc, 0, MEM_RELEASE);

    if (Data) VirtualFreeEx(hProcess, Data, 0, MEM_RELEASE);

    Rip = (DWORD64)thread_args.Proc;

    // Export not found (or injection failed): do not read from address 0 and
    // do not run the stub decoder over an uninitialised buffer.
    if (!Rip) return NULL;

    ReadProcessMemoryEx(hProcess, Rip, Bytes, BYTES_SIZE);

    Rip = GetTargetAddressFromStub(hProcess, Bytes, Rip);

    return (UCHAR*)Rip;
}

// Unloads a module from the target process by running a small injected
// thread routine there. Does nothing beyond releasing what it allocated when
// the injection fails.
void FreeLibraryRemote(HANDLE hProcess, HMODULE hModule)
{
    HANDLE hThread;
    void *Proc, *Data;
    FREE_LIBRARY_THREAD_ARGS thread_args;

    // NOTE(review): passes this process's FreeLibrary address to the target,
    // i.e. assumes kernel32 is mapped at the same address there - confirm.
    thread_args.pFreeLibrary = FreeLibrary;
    thread_args.Module = hModule;

    Proc = VirtualAllocEx(hProcess, NULL, FREE_LIBRARY_THREAD_ROUTINE_SIZE, 
                          MEM_COMMIT, PAGE_EXECUTE_READWRITE);

    Data = VirtualAllocEx(hProcess, NULL, sizeof(thread_args), MEM_COMMIT, PAGE_READWRITE);

    if (Proc && Data)
    {
        // disable incremental linking to make it work
        WriteProcessMemoryEx(hProcess, (DWORD64)Proc, (UCHAR*)FreeLibraryThreadRoutine, 
                             FREE_LIBRARY_THREAD_ROUTINE_SIZE);

        WriteProcessMemoryEx(hProcess, (DWORD64)Data, (UCHAR*)&thread_args, sizeof(thread_args));

        hThread = CreateRemoteThread(hProcess, NULL, 0, (LPTHREAD_START_ROUTINE)Proc, Data, 0, NULL);

        // Only wait on and close a thread that was actually created.
        if (hThread)
        {
            WaitForSingleObject(hThread, INFINITE);

            CloseHandle(hThread);

            ReadProcessMemoryEx(hProcess, (DWORD64)Data, (UCHAR*)&thread_args, sizeof(thread_args));
        }
    }

    if (Proc) VirtualFreeEx(hProcess, Proc, 0, MEM_RELEASE);

    if (Data) VirtualFreeEx(hProcess, Data, 0, MEM_RELEASE);
}

Now let's turn our attention to a special DLL injected in the target process to implement async hooks. The exported function:

C++
PUBLIC Log
EXTERN LogImp : PROC

.code

; Entry point reached from an async hook. On entry the top of the stack holds
; the return address pushed by the hook's 5-byte call (hooked function + 5).
; Builds a FUNCTION_CONTEXT on the stack (rip, rsp, rcx, rdx, r8, r9), passes
; it to LogImp and jumps to the trampoline address that LogImp returns, with
; rcx, rdx, r8, r9 and rsp restored to their values at function entry.
Log PROC
    pop rax                                ; get function rip + 5
    sub rax, 5                            ; get function rip
    sub rsp, 48                            ; make space for context structure
    mov qword ptr [rsp], rax            ; store rip
    lea rax, [rsp + 48]                    ; get function rsp
    mov qword ptr [rsp + 8], rax        ; store rsp
    mov qword ptr [rsp + 16], rcx        ; store rcx
    mov qword ptr [rsp + 24], rdx        ; store rdx
    mov qword ptr [rsp + 32], r8        ; store r8
    mov qword ptr [rsp + 40], r9        ; store r9
    mov rcx, rsp                        ; one parameter
    ; After the pop above rsp equals its value at function entry, which is
    ; 8 bytes off a 16-byte boundary, and the 48-byte context keeps it there.
    ; 32 bytes of shadow space + 8 bytes of padding make rsp 16-byte aligned
    ; at the call, as the Windows x64 calling convention requires.
    sub rsp, 40                            ; shadow space + alignment padding
    call LogImp                            ; returns address of trampoline
    add rsp, 40                            ; shadow space + alignment padding
    mov r9, qword ptr [rsp + 40]        ; restore r9
    mov r8, qword ptr [rsp + 32]        ; restore r8
    mov rdx, qword ptr [rsp + 24]        ; restore rdx
    mov rcx, qword ptr [rsp + 16]        ; restore rcx
    add rsp, 48                            ; restore rsp
    jmp rax                                ; jump to trampoline
Log ENDP

END

The async hook looks like this:

C++
; Illustration of the async hook placed at the start of a hooked function.
; NOTE(review): GenerateAsyncHook emits a relative call followed by
; GenerateIndirectJump; confirm that this listing matches the bytes it produces.
call $+5                                           ; push the address of the next instruction (function rip + 5)
sub rsp, 8                                         ; reserve a stack slot for the address of Log
mov dword ptr [rsp], Log                           ; low dword of Log address
mov dword ptr [rsp + 4], Log >> 32         ; high dword of Log address
ret                                                ; pop that address and jump to Log

So the first two instructions of the Log function put the address of the hooked function in the rax register.

Now LogImp function:

C++
// Called by the Log stub inside the target process. Sends the captured
// register context to the hooker over g_Socket and waits for the reply,
// which is the address of the trampoline to continue at. The whole exchange
// happens inside the g_Section critical section.
extern "C" DWORD64 LogImp(FUNCTION_CONTEXT *context)
{
    DWORD64 Reply;

    EnterCriticalSection(&g_Section);

    send(g_Socket, reinterpret_cast<const char*>(context), sizeof(FUNCTION_CONTEXT), 0);
    recv(g_Socket, reinterpret_cast<char*>(&Reply), sizeof(Reply), 0);

    LeaveCriticalSection(&g_Section);

    return Reply;
}

Basically, that's it! Thank you for reading.

License

This article, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)