Use the source, part 2: QueryPerformanceCounter

Last time we went “source diving” into GetTickCount. Now let’s try something a bit more complex: QueryPerformanceCounter. Here’s the assembly listing:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
ntdll!RtlQueryPerformanceCounter:
00000000`77a1c230 4883ec28        sub     rsp,28h
00000000`77a1c234 f60425ed02fe7f01 test    byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)],1
00000000`77a1c23c 4c8bc9          mov     r9,rcx
00000000`77a1c23f 0f84fbe30600    je      ntdll! ?? ::FNODOBFM::`string`+0x17e60 (00000000`77a8a640)
00000000`77a1c245 4c8b0425b803fe7f mov     r8,qword ptr [SharedUserData+0x3b8 (00000000`7ffe03b8)]
00000000`77a1c24d 0f31            rdtsc
00000000`77a1c24f 0fb60c25ed02fe7f movzx   ecx,byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)]
00000000`77a1c257 48c1e220        shl     rdx,20h
00000000`77a1c25b 480bc2          or      rax,rdx
00000000`77a1c25e c1e902          shr     ecx,2
00000000`77a1c261 4903c0          add     rax,r8
00000000`77a1c264 48d3e8          shr     rax,cl
00000000`77a1c267 498901          mov     qword ptr [r9],rax
00000000`77a1c26a b801000000      mov     eax,1
00000000`77a1c26f 4883c428        add     rsp,28h
00000000`77a1c273 c3              ret
 
ntdll! ?? ::FNODOBFM::`string`+0x17e60:
00000000`77a8a640 488d542438      lea     rdx,[rsp+38h]
00000000`77a8a645 e8766ffbff      call    ntdll!ZwQueryPerformanceCounter (00000000`77a415c0)
00000000`77a8a64a 85c0            test    eax,eax
00000000`77a8a64c 790e            jns     ntdll! ?? ::FNODOBFM::`string`+0x17e7c (00000000`77a8a65c)
00000000`77a8a64e 8bc8            mov     ecx,eax
00000000`77a8a650 e81bbbfaff      call    ntdll!RtlSetLastWin32ErrorAndNtStatusFromNtStatus (00000000`77a36170)
00000000`77a8a655 33c0            xor     eax,eax
00000000`77a8a657 4883c428        add     rsp,28h
00000000`77a8a65b c3              ret
00000000`77a8a65c 837c243800      cmp     dword ptr [rsp+38h],0
00000000`77a8a661 0f85031cf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a667 837c243c00      cmp     dword ptr [rsp+3Ch],0
00000000`77a8a66c 0f85f81bf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a672 b978000000      mov     ecx,78h
00000000`77a8a677 e8148bfbff      call    ntdll!RtlSetLastWin32Error (00000000`77a43190)
00000000`77a8a67c 33c0            xor     eax,eax
00000000`77a8a67e 4883c428        add     rsp,28h
00000000`77a8a682 c3              ret
ntdll!RtlQueryPerformanceCounter:
00000000`77a1c230 4883ec28        sub     rsp,28h
00000000`77a1c234 f60425ed02fe7f01 test    byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)],1
00000000`77a1c23c 4c8bc9          mov     r9,rcx
00000000`77a1c23f 0f84fbe30600    je      ntdll! ?? ::FNODOBFM::`string`+0x17e60 (00000000`77a8a640)
00000000`77a1c245 4c8b0425b803fe7f mov     r8,qword ptr [SharedUserData+0x3b8 (00000000`7ffe03b8)]
00000000`77a1c24d 0f31            rdtsc
00000000`77a1c24f 0fb60c25ed02fe7f movzx   ecx,byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)]
00000000`77a1c257 48c1e220        shl     rdx,20h
00000000`77a1c25b 480bc2          or      rax,rdx
00000000`77a1c25e c1e902          shr     ecx,2
00000000`77a1c261 4903c0          add     rax,r8
00000000`77a1c264 48d3e8          shr     rax,cl
00000000`77a1c267 498901          mov     qword ptr [r9],rax
00000000`77a1c26a b801000000      mov     eax,1
00000000`77a1c26f 4883c428        add     rsp,28h
00000000`77a1c273 c3              ret

ntdll! ?? ::FNODOBFM::`string`+0x17e60:
00000000`77a8a640 488d542438      lea     rdx,[rsp+38h]
00000000`77a8a645 e8766ffbff      call    ntdll!ZwQueryPerformanceCounter (00000000`77a415c0)
00000000`77a8a64a 85c0            test    eax,eax
00000000`77a8a64c 790e            jns     ntdll! ?? ::FNODOBFM::`string`+0x17e7c (00000000`77a8a65c)
00000000`77a8a64e 8bc8            mov     ecx,eax
00000000`77a8a650 e81bbbfaff      call    ntdll!RtlSetLastWin32ErrorAndNtStatusFromNtStatus (00000000`77a36170)
00000000`77a8a655 33c0            xor     eax,eax
00000000`77a8a657 4883c428        add     rsp,28h
00000000`77a8a65b c3              ret
00000000`77a8a65c 837c243800      cmp     dword ptr [rsp+38h],0
00000000`77a8a661 0f85031cf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a667 837c243c00      cmp     dword ptr [rsp+3Ch],0
00000000`77a8a66c 0f85f81bf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a672 b978000000      mov     ecx,78h
00000000`77a8a677 e8148bfbff      call    ntdll!RtlSetLastWin32Error (00000000`77a43190)
00000000`77a8a67c 33c0            xor     eax,eax
00000000`77a8a67e 4883c428        add     rsp,28h
00000000`77a8a682 c3              ret

And here is the relevant section of KUSER_SHARED_DATA (the structure for SharedUserData):

1
2
3
4
5
6
7
8
9
10
0:000> dt KUSER_SHARED_DATA
ole32!KUSER_SHARED_DATA
...
   +0x2ed TscQpcData       : UChar
   +0x2ed TscQpcEnabled    : Pos 0, 1 Bit
   +0x2ed TscQpcSpareFlag  : Pos 1, 1 Bit
   +0x2ed TscQpcShift      : Pos 2, 6 Bits
   +0x2ee TscQpcPad        : [2] UChar
...
   +0x3b8 TscQpcBias       : Uint8B
0:000> dt KUSER_SHARED_DATA
ole32!KUSER_SHARED_DATA
...
   +0x2ed TscQpcData       : UChar
   +0x2ed TscQpcEnabled    : Pos 0, 1 Bit
   +0x2ed TscQpcSpareFlag  : Pos 1, 1 Bit
   +0x2ed TscQpcShift      : Pos 2, 6 Bits
   +0x2ee TscQpcPad        : [2] UChar
...
   +0x3b8 TscQpcBias       : Uint8B

So let’s go through this function one instruction at a time:

1
2
3
ntdll!RtlQueryPerformanceCounter:
00000000`77a1c230 4883ec28        sub     rsp,28h
00000000`77a1c234 f60425ed02fe7f01 test    byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)],1
ntdll!RtlQueryPerformanceCounter:
00000000`77a1c230 4883ec28        sub     rsp,28h
00000000`77a1c234 f60425ed02fe7f01 test    byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)],1

We start out by testing bit 0, which corresponds to the bitfield TscQpcEnabled.

1
00000000`77a1c23c 4c8bc9          mov     r9,rcx
00000000`77a1c23c 4c8bc9          mov     r9,rcx

We’ll save the first parameter value into r9, which from the docs, we can see is “LARGE_INTEGER *lpPerformanceCount”.

1
00000000`77a1c23f 0f84fbe30600    je      ntdll! ?? ::FNODOBFM::`string`+0x17e60 (00000000`77a8a640)
00000000`77a1c23f 0f84fbe30600    je      ntdll! ?? ::FNODOBFM::`string`+0x17e60 (00000000`77a8a640)

If TscQpcEnabled is false, jump to 77a8a640 (test sets the zero flag when bit 0 is clear, and je branches when the zero flag is set). If you skim ahead, you’ll see that this branch calls into ZwQueryPerformanceCounter. So we have an enable bit that decides whether we have to call the kernel version of QueryPerformanceCounter. That’s well and good, but what happens when TscQpcEnabled is true?

1
2
00000000`77a1c245 4c8b0425b803fe7f mov     r8,qword ptr [SharedUserData+0x3b8 (00000000`7ffe03b8)]
00000000`77a1c24d 0f31            rdtsc
00000000`77a1c245 4c8b0425b803fe7f mov     r8,qword ptr [SharedUserData+0x3b8 (00000000`7ffe03b8)]
00000000`77a1c24d 0f31            rdtsc

Aha! In this case we simply execute rdtsc instead of calling into the kernel. This puts the current 64-bit processor timestamp in edx:eax. We’ve also loaded the value of TscQpcBias into r8.

1
00000000`77a1c24f 0fb60c25ed02fe7f movzx   ecx,byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)]
00000000`77a1c24f 0fb60c25ed02fe7f movzx   ecx,byte ptr [SharedUserData+0x2ed (00000000`7ffe02ed)]

Load TscQpcData field into ecx…

1
2
00000000`77a1c257 48c1e220        shl     rdx,20h
00000000`77a1c25b 480bc2          or      rax,rdx
00000000`77a1c257 48c1e220        shl     rdx,20h
00000000`77a1c25b 480bc2          or      rax,rdx

We shift rdx left 32 bits and combine it with rax, giving us the full 64-bit timestamp counter in rax rather than split across edx:eax.

1
00000000`77a1c25e c1e902          shr     ecx,2
00000000`77a1c25e c1e902          shr     ecx,2

And shift ecx (TscQpcData) right two positions to get rid of TscQpcEnabled and TscQpcSpareFlag, leaving just TscQpcShift.

1
00000000`77a1c261 4903c0          add     rax,r8
00000000`77a1c261 4903c0          add     rax,r8

Now we add TscQpcBias to the timestamp.

1
00000000`77a1c264 48d3e8          shr     rax,cl
00000000`77a1c264 48d3e8          shr     rax,cl

Now we use the value of TscQpcShift to scale the timestamp.

1
00000000`77a1c267 498901          mov     qword ptr [r9],rax
00000000`77a1c267 498901          mov     qword ptr [r9],rax

Now we move it into the location that r9 points to, which from above, we know is the LARGE_INTEGER *lpPerformanceCount parameter.

1
2
3
00000000`77a1c26a b801000000      mov     eax,1
00000000`77a1c26f 4883c428        add     rsp,28h
00000000`77a1c273 c3              ret
00000000`77a1c26a b801000000      mov     eax,1
00000000`77a1c26f 4883c428        add     rsp,28h
00000000`77a1c273 c3              ret

So if we were to rewrite this in pseudo-C, we’d have something like this:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
BOOL WINAPI QueryPerformanceCounter(
  _Out_  LARGE_INTEGER *lpPerformanceCount
)
{
   if (!SharedUserData.TscQpcEnabled) {
      // ???
   } else {
      uint64_t tsc = __rdtsc();
      tsc += SharedUserData.TscQpcBias;
      tsc = tsc >> SharedUserData.TscQpcShift;
      lpPerformanceCount->QuadPart = tsc;
      return TRUE;
   }
}
BOOL WINAPI QueryPerformanceCounter(
  _Out_  LARGE_INTEGER *lpPerformanceCount
)
{
   if (!SharedUserData.TscQpcEnabled) {
      // ???
   } else {
      uint64_t tsc = __rdtsc();
      tsc += SharedUserData.TscQpcBias;
      tsc = tsc >> SharedUserData.TscQpcShift;
      lpPerformanceCount->QuadPart = tsc;
      return TRUE;
   }
}

Now let’s figure out what happens if TscQpcEnabled is false, i.e. when the je at 77a1c23f is taken.

1
2
00000000`77a8a640 488d542438      lea     rdx,[rsp+38h]
00000000`77a8a645 e8766ffbff      call    ntdll!ZwQueryPerformanceCounter (00000000`77a415c0)
00000000`77a8a640 488d542438      lea     rdx,[rsp+38h]
00000000`77a8a645 e8766ffbff      call    ntdll!ZwQueryPerformanceCounter (00000000`77a415c0)

We still have our lpPerformanceCount parameter in rcx, so this is passed through, and rdx is set to point to some free space on the stack.

1
2
00000000`77a8a64a 85c0            test    eax,eax
00000000`77a8a64c 790e            jns     ntdll! ?? ::FNODOBFM::`string`+0x17e7c (00000000`77a8a65c)
00000000`77a8a64a 85c0            test    eax,eax
00000000`77a8a64c 790e            jns     ntdll! ?? ::FNODOBFM::`string`+0x17e7c (00000000`77a8a65c)

Here the code checks to see if the return value (in eax) is negative. Strange? Not really, both HRESULTs and NTSTATUS use the high bit to indicate an error. This gives us a very efficient way to check a return code for failure, but still allows us to store additional information about the failure, for instance E_OUTOFMEMORY or E_NOTIMPL.

If we see a success (the sign bit of eax is clear), we jump to 77a8a65c; otherwise we fall through.

1
2
00000000`77a8a64e 8bc8            mov     ecx,eax
00000000`77a8a650 e81bbbfaff      call    ntdll!RtlSetLastWin32ErrorAndNtStatusFromNtStatus (00000000`77a36170)
00000000`77a8a64e 8bc8            mov     ecx,eax
00000000`77a8a650 e81bbbfaff      call    ntdll!RtlSetLastWin32ErrorAndNtStatusFromNtStatus (00000000`77a36170)

This is the failure case, and we see the return value being moved to ecx, where it serves as a parameter to RtlSetLastWin32ErrorAndNtStatusFromNtStatus. This sets the value that will be returned from GetLastError if the caller decides to get further information on a failure, since QueryPerformanceCounter only returns a BOOL to indicate success/failure. This is a common pattern in win32 functions.

1
2
3
00000000`77a8a655 33c0            xor     eax,eax
00000000`77a8a657 4883c428        add     rsp,28h
00000000`77a8a65b c3              ret
00000000`77a8a655 33c0            xor     eax,eax
00000000`77a8a657 4883c428        add     rsp,28h
00000000`77a8a65b c3              ret

Now that “LastError” is set, we simply return FALSE by setting eax to 0.

Now we come to the other branch of execution. If ZwQueryPerformanceCounter had succeeded, we end up jumping to 77a8a65c:

1
2
3
4
00000000`77a8a65c 837c243800      cmp     dword ptr [rsp+38h],0
00000000`77a8a661 0f85031cf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a667 837c243c00      cmp     dword ptr [rsp+3Ch],0
00000000`77a8a66c 0f85f81bf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a65c 837c243800      cmp     dword ptr [rsp+38h],0
00000000`77a8a661 0f85031cf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)
00000000`77a8a667 837c243c00      cmp     dword ptr [rsp+3Ch],0
00000000`77a8a66c 0f85f81bf9ff    jne     ntdll!RtlQueryPerformanceCounter+0x3a (00000000`77a1c26a)

Now we check the local variable whose address we passed to ZwQueryPerformanceCounter, one 32-bit half at a time. If either half is nonzero, we jump to 77a1c26a, which we already saw as the tail of our success case, returning TRUE from this function. What does it mean for this variable to be zero? We’ll have to dig deeper for that, but for now we can assume that maybe it’s some sort of status code or something.

1
2
00000000`77a8a672 b978000000      mov     ecx,78h
00000000`77a8a677 e8148bfbff      call    ntdll!RtlSetLastWin32Error (00000000`77a43190)
00000000`77a8a672 b978000000      mov     ecx,78h
00000000`77a8a677 e8148bfbff      call    ntdll!RtlSetLastWin32Error (00000000`77a43190)

At this point, we know we’ve failed. We call RtlSetLastWin32Error(0x78), which translates to ERROR_CALL_NOT_IMPLEMENTED.

1
2
3
00000000`77a8a67c 33c0            xor     eax,eax
00000000`77a8a67e 4883c428        add     rsp,28h
00000000`77a8a682 c3              ret
00000000`77a8a67c 33c0            xor     eax,eax
00000000`77a8a67e 4883c428        add     rsp,28h
00000000`77a8a682 c3              ret

And finally, we return FALSE. Now that we’ve looked at the failure branch of QueryPerformanceCounter, let’s see if we can come up with a pseudo-C function to describe the whole thing:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
BOOL WINAPI QueryPerformanceCounter(
  _Out_  LARGE_INTEGER *lpPerformanceCount
)
{
   if (!SharedUserData.TscQpcEnabled) {
      uint64_t foo;
      NTSTATUS status = ZwQueryPerformanceCounter(lpPerformanceCount, &foo);
      if (NT_SUCCESS(status)) {
         if (foo != 0) {
            return TRUE;
         } else {
            RtlSetLastWin32Error(ERROR_CALL_NOT_IMPLEMENTED);
            return FALSE;
         }
      } else {
          RtlSetLastWin32ErrorAndNtStatusFromNtStatus(status);
          return FALSE;
      }
   } else {
      uint64_t tsc = __rdtsc();
      tsc += SharedUserData.TscQpcBias;
      tsc = tsc >> SharedUserData.TscQpcShift;
      lpPerformanceCount->QuadPart = tsc;
      return TRUE;
   }
}
BOOL WINAPI QueryPerformanceCounter(
  _Out_  LARGE_INTEGER *lpPerformanceCount
)
{
   if (!SharedUserData.TscQpcEnabled) {
      uint64_t foo;
      NTSTATUS status = ZwQueryPerformanceCounter(lpPerformanceCount, &foo);
      if (NT_SUCCESS(status)) {
         if (foo != 0) {
            return TRUE;
         } else {
            RtlSetLastWin32Error(ERROR_CALL_NOT_IMPLEMENTED);
            return FALSE;
         }
      } else {
          RtlSetLastWin32ErrorAndNtStatusFromNtStatus(status);
          return FALSE;
      }
   } else {
      uint64_t tsc = __rdtsc();
      tsc += SharedUserData.TscQpcBias;
      tsc = tsc >> SharedUserData.TscQpcShift;
      lpPerformanceCount->QuadPart = tsc;
      return TRUE;
   }
}

That about wraps it up. But wait! What happens inside ZwQueryPerformanceCounter? Let’s check:

1
2
3
4
5
6
0:000> u ZwQueryPerformanceCounter
ntdll!ZwQueryPerformanceCounter:
00000000`77a415c0 4c8bd1          mov     r10,rcx
00000000`77a415c3 b82e000000      mov     eax,2Eh
00000000`77a415c8 0f05            syscall
00000000`77a415ca c3              ret
0:000> u ZwQueryPerformanceCounter
ntdll!ZwQueryPerformanceCounter:
00000000`77a415c0 4c8bd1          mov     r10,rcx
00000000`77a415c3 b82e000000      mov     eax,2Eh
00000000`77a415c8 0f05            syscall
00000000`77a415ca c3              ret

All we see here is a wrapper around a syscall. This is a standard pattern you see on the Zw* functions, which are user-mode wrappers around kernel functions. To dig deeper, we’ll need a kernel debugger, so that’s what we’ll pull out next time.

1 comment

  1. You should strip the excess from the assembly language. It is very wide, which impairs readability. If you get rid of the leading zeroes and ` then you save nine columns. Get rid of the machine code and you save another 16 or so columns. The addresses on the right could also be stripped off. This would get rid of the horizontal scroll bars and make a huge difference. I’d comment on the content but right now much of it is too hard to read. FWIW.

Leave a Reply

Your email address will not be published. Required fields are marked *