Last time we went “source diving” into GetTickCount. Now let’s try something a bit more complex: QueryPerformanceCounter. Here’s the assembly listing:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
ntdll!RtlQueryPerformanceCounter:
00000000`77a1c230 4883ec28        sub     rsp,28h
00000000`77a1c234 f60425ed02fe7f01 test    byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)],1
00000000`77a1c23c 4c8bc9          mov     r9,rcx
00000000`77a1c23f 0f84fbe30600    je      ntdll! ?? ::FNODOBFM::`string`+0x17e60 (00000000`77a8a640)
00000000`77a1c245 4c8b0425b803fe7f mov     r8,qword ptr [SharedUserData+0x3b8 (00000000`7ffe03b8)]
00000000`77a1c24d 0f31            rdtsc
00000000`77a1c24f 0fb60c25ed02fe7f movzx   ecx,byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)]
00000000`77a1c257 48c1e220        shl     rdx,20h
00000000`77a1c25b 480bc2          or      rax,rdx
00000000`77a1c25e c1e902          shr     ecx,2
00000000`77a1c261 4903c0          add     rax,r8
00000000`77a1c264 48d3e8          shr     rax,cl
00000000`77a1c267 498901          mov     qword ptr [r9],rax
00000000`77a1c26a b801000000      mov     eax,1
00000000`77a1c26f 4883c428        add     rsp,28h
00000000`77a1c273 c3              ret
 
ntdll! ?? ::FNODOBFM::`string`+0x17e60:
00000000`77a8a640 488d542438      lea     rdx,[rsp+38h]
00000000`77a8a645 e8766ffbff      call    ntdll!ZwQueryPerformanceCounter (00000000`77a415c0)
00000000`77a8a64a 85c0            test    eax,eax
00000000`77a8a64c 790e            jns     ntdll! ?? ::FNODOBFM::`string`+0x17e7c (00000000`77a8a65c)
00000000`77a8a64e 8bc8            mov     ecx,eax
00000000`77a8a650 e81bbbfaff      call    ntdll!RtlSetLastWin32ErrorAndNtStatusFromNtStatus (00000000`77a36170)
00000000`77a8a655 33c0            xor     eax,eax
00000000`77a8a657 4883c428        add     rsp,28h
00000000`77a8a65b c3              ret
00000000`77a8a65c 837c243800      cmp     dword ptr [rsp+38h],0
00000000`77a8a661 0f85031cf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a667 837c243c00      cmp     dword ptr [rsp+3Ch],0
00000000`77a8a66c 0f85f81bf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a672 b978000000      mov     ecx,78h
00000000`77a8a677 e8148bfbff      call    ntdll!RtlSetLastWin32Error (00000000`77a43190)
00000000`77a8a67c 33c0            xor     eax,eax
00000000`77a8a67e 4883c428        add     rsp,28h
00000000`77a8a682 c3              ret
ntdll!RtlQueryPerformanceCounter:
00000000`77a1c230 4883ec28        sub     rsp,28h
00000000`77a1c234 f60425ed02fe7f01 test    byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)],1
00000000`77a1c23c 4c8bc9          mov     r9,rcx
00000000`77a1c23f 0f84fbe30600    je      ntdll! ?? ::FNODOBFM::`string`+0x17e60 (00000000`77a8a640)
00000000`77a1c245 4c8b0425b803fe7f mov     r8,qword ptr [SharedUserData+0x3b8 (00000000`7ffe03b8)]
00000000`77a1c24d 0f31            rdtsc
00000000`77a1c24f 0fb60c25ed02fe7f movzx   ecx,byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)]
00000000`77a1c257 48c1e220        shl     rdx,20h
00000000`77a1c25b 480bc2          or      rax,rdx
00000000`77a1c25e c1e902          shr     ecx,2
00000000`77a1c261 4903c0          add     rax,r8
00000000`77a1c264 48d3e8          shr     rax,cl
00000000`77a1c267 498901          mov     qword ptr [r9],rax
00000000`77a1c26a b801000000      mov     eax,1
00000000`77a1c26f 4883c428        add     rsp,28h
00000000`77a1c273 c3              ret

ntdll! ?? ::FNODOBFM::`string`+0x17e60:
00000000`77a8a640 488d542438      lea     rdx,[rsp+38h]
00000000`77a8a645 e8766ffbff      call    ntdll!ZwQueryPerformanceCounter (00000000`77a415c0)
00000000`77a8a64a 85c0            test    eax,eax
00000000`77a8a64c 790e            jns     ntdll! ?? ::FNODOBFM::`string`+0x17e7c (00000000`77a8a65c)
00000000`77a8a64e 8bc8            mov     ecx,eax
00000000`77a8a650 e81bbbfaff      call    ntdll!RtlSetLastWin32ErrorAndNtStatusFromNtStatus (00000000`77a36170)
00000000`77a8a655 33c0            xor     eax,eax
00000000`77a8a657 4883c428        add     rsp,28h
00000000`77a8a65b c3              ret
00000000`77a8a65c 837c243800      cmp     dword ptr [rsp+38h],0
00000000`77a8a661 0f85031cf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a667 837c243c00      cmp     dword ptr [rsp+3Ch],0
00000000`77a8a66c 0f85f81bf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a672 b978000000      mov     ecx,78h
00000000`77a8a677 e8148bfbff      call    ntdll!RtlSetLastWin32Error (00000000`77a43190)
00000000`77a8a67c 33c0            xor     eax,eax
00000000`77a8a67e 4883c428        add     rsp,28h
00000000`77a8a682 c3              ret

And here is the relevant section of KUSER_SHARED_DATA (the structure for SharedUserData):

1
2
3
4
5
6
7
8
9
10
0:000> dt KUSER_SHARED_DATA
ole32!KUSER_SHARED_DATA
...
   +0x2ed TscQpcData       : UChar
   +0x2ed TscQpcEnabled    : Pos 0, 1 Bit
   +0x2ed TscQpcSpareFlag  : Pos 1, 1 Bit
   +0x2ed TscQpcShift      : Pos 2, 6 Bits
   +0x2ee TscQpcPad        : [2] UChar
...
   +0x3b8 TscQpcBias       : Uint8B
0:000> dt KUSER_SHARED_DATA
ole32!KUSER_SHARED_DATA
...
   +0x2ed TscQpcData       : UChar
   +0x2ed TscQpcEnabled    : Pos 0, 1 Bit
   +0x2ed TscQpcSpareFlag  : Pos 1, 1 Bit
   +0x2ed TscQpcShift      : Pos 2, 6 Bits
   +0x2ee TscQpcPad        : [2] UChar
...
   +0x3b8 TscQpcBias       : Uint8B

So let’s go through this function one instruction at a time:

1
2
3
ntdll!RtlQueryPerformanceCounter:
00000000`77a1c230 4883ec28        sub     rsp,28h
00000000`77a1c234 f60425ed02fe7f01 test    byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)],1
ntdll!RtlQueryPerformanceCounter:
00000000`77a1c230 4883ec28        sub     rsp,28h
00000000`77a1c234 f60425ed02fe7f01 test    byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)],1

We start out by testing bit 0, which corresponds to the bitfield TscQpcEnabled.

1
00000000`77a1c23c 4c8bc9          mov     r9,rcx
00000000`77a1c23c 4c8bc9          mov     r9,rcx

We’ll save the first parameter value into r9, which from the docs, we can see is “LARGE_INTEGER *lpPerformanceCount”.

1
00000000`77a1c23f 0f84fbe30600    je      ntdll! ?? ::FNODOBFM::`string`+0x17e60 (00000000`77a8a640)
00000000`77a1c23f 0f84fbe30600    je      ntdll! ?? ::FNODOBFM::`string`+0x17e60 (00000000`77a8a640)

If TscQpcEnabled is clear, jump to 77a8a640: the test instruction sets the zero flag when bit 0 is 0, and je is taken when the zero flag is set. If you skim ahead, you’ll see that this branch calls into ZwQueryPerformanceCounter. So we have an enable bit that decides whether we can avoid calling the kernel version of QueryPerformanceCounter. That’s well and good, but what happens when TscQpcEnabled is set?

1
2
00000000`77a1c245 4c8b0425b803fe7f mov     r8,qword ptr [SharedUserData+0x3b8 (00000000`7ffe03b8)]
00000000`77a1c24d 0f31            rdtsc
00000000`77a1c245 4c8b0425b803fe7f mov     r8,qword ptr [SharedUserData+0x3b8 (00000000`7ffe03b8)]
00000000`77a1c24d 0f31            rdtsc

Aha! On the fall-through path (the je was not taken, so TscQpcEnabled is set) we use a simple rdtsc instruction. This puts the 64-bit processor timestamp in edx:eax. We’ve also loaded the value of TscQpcBias into r8.

1
00000000`77a1c24f 0fb60c25ed02fe7f movzx   ecx,byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)]
00000000`77a1c24f 0fb60c25ed02fe7f movzx   ecx,byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)]

Load TscQpcData field into ecx…

1
2
00000000`77a1c257 48c1e220        shl     rdx,20h
00000000`77a1c25b 480bc2          or      rax,rdx
00000000`77a1c257 48c1e220        shl     rdx,20h
00000000`77a1c25b 480bc2          or      rax,rdx

We shift rdx left 32 bits and combine it with rax, giving us the full 64-bit timestamp counter in rax rather than split across edx:eax.

1
00000000`77a1c25e c1e902          shr     ecx,2
00000000`77a1c25e c1e902          shr     ecx,2

And shift ecx (TscQpcData) right two positions to get rid of TscQpcEnabled and TscQpcSpareFlag, giving us TscQpcShift

1
00000000`77a1c261 4903c0          add     rax,r8
00000000`77a1c261 4903c0          add     rax,r8

Now we add in TscQpcBias into the timestamp.

1
00000000`77a1c264 48d3e8          shr     rax,cl
00000000`77a1c264 48d3e8          shr     rax,cl

Now we use the value of TscQpcShift to scale the timestamp.

1
00000000`77a1c267 498901          mov     qword ptr [r9],rax
00000000`77a1c267 498901          mov     qword ptr [r9],rax

Now we move it into the location that r9 points to, which from above, we know is the LARGE_INTEGER *lpPerformanceCount parameter.

1
2
3
00000000`77a1c26a b801000000      mov     eax,1
00000000`77a1c26f 4883c428        add     rsp,28h
00000000`77a1c273 c3              ret
00000000`77a1c26a b801000000      mov     eax,1
00000000`77a1c26f 4883c428        add     rsp,28h
00000000`77a1c273 c3              ret

So if we were to rewrite this in pseudo-C, we’d have something like this:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
// Pseudo-C sketch of ntdll!RtlQueryPerformanceCounter (rdtsc path only).
// The disassembly does `test byte ptr [SharedUserData+0x2ed],1` followed by
// `je`, so the jump to the ZwQueryPerformanceCounter path is taken when bit 0
// (TscQpcEnabled) is CLEAR; the rdtsc path is the fall-through, reached when
// the bit is SET.
// On the rdtsc path the scaled counter is stored through lpPerformanceCount
// and TRUE is returned.
BOOL WINAPI QueryPerformanceCounter(
  _Out_  LARGE_INTEGER *lpPerformanceCount
)
{
   if (SharedUserData.TscQpcEnabled) {
      uint64_t tsc = __rdtsc();                 // edx:eax merged into one 64-bit value
      tsc += SharedUserData.TscQpcBias;         // add rax,r8
      tsc = tsc >> SharedUserData.TscQpcShift;  // shr rax,cl (cl = TscQpcData >> 2)
      lpPerformanceCount->QuadPart = tsc;
      return TRUE;
   } else {
      // ??? (the branch that calls ZwQueryPerformanceCounter)
   }
}
// Pseudo-C sketch of ntdll!RtlQueryPerformanceCounter (rdtsc path only).
// The disassembly does `test byte ptr [SharedUserData+0x2ed],1` followed by
// `je`, so the jump to the ZwQueryPerformanceCounter path is taken when bit 0
// (TscQpcEnabled) is CLEAR; the rdtsc path is the fall-through, reached when
// the bit is SET.
// On the rdtsc path the scaled counter is stored through lpPerformanceCount
// and TRUE is returned.
BOOL WINAPI QueryPerformanceCounter(
  _Out_  LARGE_INTEGER *lpPerformanceCount
)
{
   if (SharedUserData.TscQpcEnabled) {
      uint64_t tsc = __rdtsc();                 // edx:eax merged into one 64-bit value
      tsc += SharedUserData.TscQpcBias;         // add rax,r8
      tsc = tsc >> SharedUserData.TscQpcShift;  // shr rax,cl (cl = TscQpcData >> 2)
      lpPerformanceCount->QuadPart = tsc;
      return TRUE;
   } else {
      // ??? (the branch that calls ZwQueryPerformanceCounter)
   }
}

Now let’s figure out what happens if TscQpcEnabled is false, which is the case where the je at 77a1c23f is taken.

1
2
00000000`77a8a640 488d542438      lea     rdx,[rsp+38h]
00000000`77a8a645 e8766ffbff      call    ntdll!ZwQueryPerformanceCounter (00000000`77a415c0)
00000000`77a8a640 488d542438      lea     rdx,[rsp+38h]
00000000`77a8a645 e8766ffbff      call    ntdll!ZwQueryPerformanceCounter (00000000`77a415c0)

We still have our lpPerformanceCount parameter in rcx, so this is passed through, and rdx is set to point to some free space on the stack.

1
2
00000000`77a8a64a 85c0            test    eax,eax
00000000`77a8a64c 790e            jns     ntdll! ?? ::FNODOBFM::`string`+0x17e7c (00000000`77a8a65c)
00000000`77a8a64a 85c0            test    eax,eax
00000000`77a8a64c 790e            jns     ntdll! ?? ::FNODOBFM::`string`+0x17e7c (00000000`77a8a65c)

Here the code checks to see if the return value (in eax) is negative. Strange? Not really, both HRESULTs and NTSTATUS use the high bit to indicate an error. This gives us a very efficient way to check a return code for failure, but still allows us to store additional information about the failure, for instance E_OUTOFMEMORY or E_NOTIMPL.

If we see a success (the sign bit of eax is clear), we jump to 77a8a65c; otherwise we fall through.

1
2
00000000`77a8a64e 8bc8            mov     ecx,eax
00000000`77a8a650 e81bbbfaff      call    ntdll!RtlSetLastWin32ErrorAndNtStatusFromNtStatus (00000000`77a36170)
00000000`77a8a64e 8bc8            mov     ecx,eax
00000000`77a8a650 e81bbbfaff      call    ntdll!RtlSetLastWin32ErrorAndNtStatusFromNtStatus (00000000`77a36170)

This is the failure case, and we see the return value being moved to ecx, where it serves as a parameter to RtlSetLastWin32ErrorAndNtStatusFromNtStatus. This sets the value that will be returned from GetLastError if the caller decides to get further information on a failure, since QueryPerformanceCounter only returns a BOOL to indicate success/failure. This is a common pattern in win32 functions.

1
2
3
00000000`77a8a655 33c0            xor     eax,eax
00000000`77a8a657 4883c428        add     rsp,28h
00000000`77a8a65b c3              ret
00000000`77a8a655 33c0            xor     eax,eax
00000000`77a8a657 4883c428        add     rsp,28h
00000000`77a8a65b c3              ret

Now that “LastError” is set, we simply return FALSE by setting eax to 0.

Now we come to the other branch of execution. If ZwQueryPerformanceCounter had succeeded, we end up jumping to 77a8a65c:

1
2
3
4
00000000`77a8a65c 837c243800      cmp     dword ptr [rsp+38h],0
00000000`77a8a661 0f85031cf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a667 837c243c00      cmp     dword ptr [rsp+3Ch],0
00000000`77a8a66c 0f85f81bf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a65c 837c243800      cmp     dword ptr [rsp+38h],0
00000000`77a8a661 0f85031cf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a667 837c243c00      cmp     dword ptr [rsp+3Ch],0
00000000`77a8a66c 0f85f81bf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)

Now we check the local variable whose address we passed to ZwQueryPerformanceCounter, one 32-bit half at a time. If either half is nonzero, we jump to 77a1c26a, which we already saw as the tail of our success case, returning TRUE from this function. What does it mean for this variable to be zero? We’ll have to dig deeper for that, but for now we can assume that maybe it’s some sort of status code or something.

1
2
00000000`77a8a672 b978000000      mov     ecx,78h
00000000`77a8a677 e8148bfbff      call    ntdll!RtlSetLastWin32Error (00000000`77a43190)
00000000`77a8a672 b978000000      mov     ecx,78h
00000000`77a8a677 e8148bfbff      call    ntdll!RtlSetLastWin32Error (00000000`77a43190)

At this point, we know we’ve failed. We call RtlSetLastWin32Error(0x78), which translates to ERROR_CALL_NOT_IMPLEMENTED.

1
2
3
00000000`77a8a67c 33c0            xor     eax,eax
00000000`77a8a67e 4883c428        add     rsp,28h
00000000`77a8a682 c3              ret
00000000`77a8a67c 33c0            xor     eax,eax
00000000`77a8a67e 4883c428        add     rsp,28h
00000000`77a8a682 c3              ret

And finally, we return FALSE. Now that we’ve looked at the failure branch of QueryPerformanceCounter, let’s see if we can come up with a pseudo-C function to describe the whole thing:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
// Pseudo-C reconstruction of ntdll!RtlQueryPerformanceCounter.
// The disassembly does `test byte ptr [SharedUserData+0x2ed],1` followed by
// `je`, so the ZwQueryPerformanceCounter path is taken when bit 0
// (TscQpcEnabled) is CLEAR; the rdtsc path is the fall-through, reached when
// the bit is SET.
// Returns TRUE with *lpPerformanceCount filled in on success; returns FALSE
// after setting the last-error value on failure.
BOOL WINAPI QueryPerformanceCounter(
  _Out_  LARGE_INTEGER *lpPerformanceCount
)
{
   if (SharedUserData.TscQpcEnabled) {
      // User-mode path: no system call.
      uint64_t tsc = __rdtsc();                 // edx:eax merged into one 64-bit value
      tsc += SharedUserData.TscQpcBias;         // add rax,r8
      tsc = tsc >> SharedUserData.TscQpcShift;  // shr rax,cl (cl = TscQpcData >> 2)
      lpPerformanceCount->QuadPart = tsc;
      return TRUE;
   } else {
      // Kernel path: foo is the 64-bit stack slot at [rsp+38h] whose address
      // is passed as the second argument.
      // NOTE(review): presumably the frequency out-parameter of
      // NtQueryPerformanceCounter -- confirm against its documentation.
      uint64_t foo;
      NTSTATUS status = ZwQueryPerformanceCounter(lpPerformanceCount, &foo);
      if (NT_SUCCESS(status)) {                 // test eax,eax / jns
         if (foo != 0) {                        // either 32-bit half nonzero
            return TRUE;
         } else {
            RtlSetLastWin32Error(ERROR_CALL_NOT_IMPLEMENTED);   // 0x78
            return FALSE;
         }
      } else {
          RtlSetLastWin32ErrorAndNtStatusFromNtStatus(status);
          return FALSE;
      }
   }
}
// Pseudo-C reconstruction of ntdll!RtlQueryPerformanceCounter.
// The disassembly does `test byte ptr [SharedUserData+0x2ed],1` followed by
// `je`, so the ZwQueryPerformanceCounter path is taken when bit 0
// (TscQpcEnabled) is CLEAR; the rdtsc path is the fall-through, reached when
// the bit is SET.
// Returns TRUE with *lpPerformanceCount filled in on success; returns FALSE
// after setting the last-error value on failure.
BOOL WINAPI QueryPerformanceCounter(
  _Out_  LARGE_INTEGER *lpPerformanceCount
)
{
   if (SharedUserData.TscQpcEnabled) {
      // User-mode path: no system call.
      uint64_t tsc = __rdtsc();                 // edx:eax merged into one 64-bit value
      tsc += SharedUserData.TscQpcBias;         // add rax,r8
      tsc = tsc >> SharedUserData.TscQpcShift;  // shr rax,cl (cl = TscQpcData >> 2)
      lpPerformanceCount->QuadPart = tsc;
      return TRUE;
   } else {
      // Kernel path: foo is the 64-bit stack slot at [rsp+38h] whose address
      // is passed as the second argument.
      // NOTE(review): presumably the frequency out-parameter of
      // NtQueryPerformanceCounter -- confirm against its documentation.
      uint64_t foo;
      NTSTATUS status = ZwQueryPerformanceCounter(lpPerformanceCount, &foo);
      if (NT_SUCCESS(status)) {                 // test eax,eax / jns
         if (foo != 0) {                        // either 32-bit half nonzero
            return TRUE;
         } else {
            RtlSetLastWin32Error(ERROR_CALL_NOT_IMPLEMENTED);   // 0x78
            return FALSE;
         }
      } else {
          RtlSetLastWin32ErrorAndNtStatusFromNtStatus(status);
          return FALSE;
      }
   }
}

That about wraps it up. But wait! What happens inside ZwQueryPerformanceCounter? Let’s check:

1
2
3
4
5
6
0:000> u ZwQueryPerformanceCounter
ntdll!ZwQueryPerformanceCounter:
00000000`77a415c0 4c8bd1          mov     r10,rcx
00000000`77a415c3 b82e000000      mov     eax,2Eh
00000000`77a415c8 0f05            syscall
00000000`77a415ca c3              ret
0:000> u ZwQueryPerformanceCounter
ntdll!ZwQueryPerformanceCounter:
00000000`77a415c0 4c8bd1          mov     r10,rcx
00000000`77a415c3 b82e000000      mov     eax,2Eh
00000000`77a415c8 0f05            syscall
00000000`77a415ca c3              ret

All we see here is a wrapper around a syscall. This is a standard pattern you see on the Zw* functions, which are usermode wrappers around kernel functions. To dig deeper, we’ll need a kernel debugger, so that’s what we’ll pull out next time.

The other day, a coworker mentioned that he wasn’t using high resolution timers on Windows because he heard “they caused clock drift”. His statement struck me as odd. Why would checking the time change the time? Are we causing some sort of quantum observer effect? So I asked him: why don’t we just check what the function does?

True, we can’t just open up the source files, but we have something almost as good: the binary itself! A short C function almost always translates into a short assembly function, and GetTickCount is no different. To follow along at home, fire up windbg attached to a 64 bit process (any will do, like notepad) and run “uf KERNELBASE!GetTickCount”. We see this:


0:000> uf KERNELBASE!GetTickCount
KERNELBASE!GetTickCount:
000007ff`14211250 b92003fe7f mov ecx,offset SharedUserData+0x320 (00000000`7ffe0320)
000007ff`14211255 488b09 mov rcx,qword ptr [rcx]
000007ff`14211258 8b04250400fe7f mov eax,dword ptr [SharedUserData+0x4 (00000000`7ffe0004)]
000007ff`1421125f 480fafc1 imul rax,rcx
000007ff`14211263 48c1e818 shr rax,18h
000007ff`14211267 c3 ret

We can analyze this function step by step, but from a quick glance, we see we have an immediate load, two memory reads, a multiply, and a shift. It seems unlikely that this will cause any “clock drift”. So how is it possible that we can read the current tick count so simply? A quick google search reveals that this is the “shared user page”, a page mapped into all usermode processes. All the kernel needs to do is update this value every time a clock interrupt occurs and magically we have a GetTickCount function that is so fast it’s nearly free. And no worries about a clock drift.

So this means we’ve dispelled the myth of the mysterious clock drift, right? Well, not exactly. We know that GetTickCount seems unlikely to ever cause the clock to drift, but it turns out there is a real “clock drift problem”. That deserves its own blog post.

I heard an interesting interview question recently. The question is: “How does the ‘dir’ command work?”. At the surface, it’s an easy question, but we can use this question to drive all the way down through the internals of the operating system.

Let’s start at the top, from a user’s perspective. A user is sitting at a command prompt in windows (cmd.exe), and types “dir”. The first thing that happens is that the command goes to a parser. The parser looks for internal commands before looking for a program named “dir”, since a large amount of functionality is implemented as built-in commands. The parser finds the internal command for dir, and executes it as a subroutine of cmd.exe.

At this point, the internal dir command will use the windows APIs to enumerate the files in the current directory. In windows, file enumeration is done through the FindFirstFile/FindNextFile functions. These functions are located in the kernel32.dll module. Despite its name, kernel32 is a user mode module that resides in the same address space as cmd.exe. How can we figure out what FindFirstFile does? We break out the debugger, of course!

To follow along, grab a copy of windbg, and start a debugging session for cmd.exe. You should be able to follow along with any version of windows, but I’m using Windows 7 for this experiment.

After letting the command prompt start up, we’ll break into the debugger and set a breakpoint on kernel32!FindFirstFileExW. I discovered this was the right breakpoint by setting a breakpoint on all of the FindFirstFile functions and seeing this one get called first. Now that we have a breakpoint set, resume execution of the target process with a “g” command, and head back over to the console we are debugging. Enter a “dir” command, and if all is well, we’ll immediately break back into the debugger at FindFirstFileExW.

The line you see will likely look something like this:

00000000`7767c4a8 ff25b20f0800 jmp qword ptr [kernel32!_imp_FindFirstFileExW (00000000`776fd460)] ds:00000000`776fd460={KERNELBASE!FindFirstFileExW (000007fe`fd724d40)}

This is an indirect jump using a symbol called “_imp_FindFirstFileExW”. Why an indirect jump? Shouldn’t kernel32.dll know where FindFirstFileExW is? Actually, it turns out that the “real” FindFirstFileExW is in kernelbase.dll, and the kernel32 version just calls through to the imported function in kernelbase. So let’s step into the “real version” in kernelbase. You can use a “t” command in windbg to do this.

Now we see something that looks more like a real function:

KERNELBASE!FindFirstFileExW:
000007fe`fd724d40 fff3 push rbx
000007fe`fd724d42 56 push rsi

Since we have a live debugging session, we have a huge advantage when analyzing what the function does, because we can trace through the function as it executes rather than trying to guess what it would do while looking at the disassembly. One of the most powerful commands for this is the “wt” command, which means “watch and trace”. It traces through a function execution, listing out the function names each time a function is called or a function returns. It also has the nice feature of listing all the system calls that are executed. In the summary for FindFirstFileExW, we see the following system calls:

2 system calls were executed

Calls System Call
1 ntdll!NtQueryDirectoryFile
1 ntdll!ZwOpenFile

If you look at the trace from wt, you’ll see that ZwOpenFile comes first, and then NtQueryDirectoryFile. This makes sense, once you realize that a directory is a type of file. It is first opened with ZwOpenFile, and then we can query information from it using NtQueryDirectoryFile. Both of these functions are documented on the msdn website, as these functions are the entrypoint into kernel from usermode, but are also available to kernel-mode components such as drivers.

How do ZwOpenFile and NtQueryDirectoryFile work, then? In windows, many sources of data are mapped into a single unified namespace. You can see this when you map a network drive and access it as if it were a local disk. You can also see this in a more subtle way when you have two filesystems, say fat32 and ntfs, both accessible through the same APIs. Internally, this is implemented with tables of handler functions that depend on the location within the namespace. In other words, querying for files on an NTFS volume routes the requests through the NTFS driver, while FAT32 volumes route requests through the FAT32 driver. The filesystem drivers are layered on top of the disk drivers, until eventually we get to a request for a block on the physical location of the data (like a disk drive).

This was a bit of a whirlwind tour, but should give you a good idea of what happens when you type “dir” at the console. I think if I write another post like this, I’ll take a look at how “malloc” works on a Windows system.

At the highest levels of abstraction, computer science is pure math. One can write proofs about the behavior of a program, including proofs of correctness, algorithmic complexity, and other things. At the lowest levels of computer engineering, we see physics applied to understand and plan the behavior of silicon at the quantum level. What’s in the middle? At many levels, we see more art than science, and this is particularly true of assembly language programming. Over time, programmers who delve into this abyss learn certain “tricks of the trade” that are difficult to teach outside of experience. One of the skills I’ve learned is “recognizing code”. That is, seeing some hex bytes and recognizing them as code, data, or garbage.

Some of the tricks are easy, for instance, if you ever see “CC CC CC …” you can be nearly certain that you are looking at code. Why? 0xCC is a single byte x86 opcode representing “int 3”, which is the “breakpoint interrupt”. This one byte interrupt encoding is very useful when setting “soft breakpoints” (as opposed to hardware breakpoints) where a single byte of an instruction stream can be changed to CC to insert a breakpoint.

Sometimes it requires a bit of knowledge of the instruction encodings to make inferences. Most x86 instructions (both 32-bit and 64-bit) default to 32-bit operand sizes. In 64-bit code, this can be changed to 64-bit operands, as well as extending the number of registers available, through the use of the “REX” prefixes. These prefixes can extend the size and range of the operands being encoded. What the REX prefixes have in common is that they are in the range of 0x40-0x4F. Here’s a random snippet of ntdll, which I grabbed with windbg:

0:000> db .
00000000`778e2e85 90 e9 af ef fd ff 66 44-39 2f 0f 84 bd ef fd ff ......fD9/......
00000000`778e2e95 48 8d 94 24 90 00 00 00-4c 8b c7 48 8b cf 4d 89 H..$....L..H..M.

We see 0x4* show up a number of times in this sequence, and sure enough, every single one of these is a REX byte:

00000000`778e2e85 90 nop
00000000`778e2e86 e9afeffdff jmp ntdll!LdrQueryImageFileKeyOption+0x4da
00000000`778e2e8b 6644392f cmp word ptr [rdi],r13w
00000000`778e2e8f 0f84bdeffdff je ntdll!LdrQueryImageFileKeyOption+0x4f2
00000000`778e2e95 488d942490000000 lea rdx,[rsp+90h]
00000000`778e2e9d 4c8bc7 mov r8,rdi
00000000`778e2ea0 488bcf mov rcx,rdi

This trick only works on 64-bit code, however, since on 32-bit code those code bytes map to the single byte versions of the “INC” and “DEC” instructions.

Looking at code bytes is well and good, but usually we have a disassembler handy to look at the actual instructions. Here’s a longer section of ntdll:

00000000`778e2e86 e9afeffdff jmp ntdll!LdrQueryImageFileKeyOption+0x4da
00000000`778e2e8b 6644392f cmp word ptr [rdi],r13w
00000000`778e2e8f 0f84bdeffdff je ntdll!LdrQueryImageFileKeyOption+0x4f2
00000000`778e2e95 488d942490000000 lea rdx,[rsp+90h]
00000000`778e2e9d 4c8bc7 mov r8,rdi
00000000`778e2ea0 488bcf mov rcx,rdi
00000000`778e2ea3 4d89afe0020000 mov qword ptr [r15+2E0h],r13
00000000`778e2eaa e8d1770900 call ntdll!EtwDeliverDataBlock+0x220
00000000`778e2eaf 90 nop
00000000`778e2eb0 e99deffdff jmp ntdll!LdrQueryImageFileKeyOption+0x4f2
00000000`778e2eb5 8b0dd57a0d00 mov ecx,dword ptr [ntdll!LdrSystemDllInitBlock+0xb0]
00000000`778e2ebb f6c103 test cl,3
00000000`778e2ebe 7431 je ntdll!RtlIsDosDeviceName_U+0x94d1

We can see a number of signs that immediately tell us that this is real code. For a first step, we see a number of comparisons followed immediately by a conditional jump. Also, we see a call preceded by setting rcx/r8/rdx. On x64, parameters are passed in rcx, rdx, r8, and r9. Finally, we see a number of jumps/calls to locations within the same module, which we can see by the module!export+offset where the offset is relatively small. This one can be misleading at times, however, since a jump with an 8-bit relative offset will almost always be within the same module, and might look valid even if it isn’t.

Being able to recognize code can be a useful skill. When debugging without source code, it’s often useful to be able to tell if a destination address (through an indirect call for instance) is pointing to real code. When writing an emulator/disassembler, it’s useful to see if the instruction boundaries have been successfully determined. When looking at a crash, it’s useful to see if real code is executing, or if the instruction pointer is somehow pointing to a data section (through stack corruption or a bad indirect jump). There are often other ways to determine if a piece of memory is actually code, but there are times when these methods fail, such as dynamically generated code (for instance, from a jitter or from a buffer overflow attack). I’ve even seen bugs that we were able to solve quickly by realizing that executable code had somehow been loaded into a register!