Need explanation on assembly instructions of K&R fahr

I am stuck learning basics of assembly language with fahrenheit to celsius example from K&R book. Here is C code that I am referring to:

#include <stdio.h>

main()
{
    int fahr, celsius;
    int lower, upper, step;

    lower = 0;
    upper = 300;
    step = 20;

    fahr = lower;
    while (fahr <= upper) {
        celsius = 5 * (fahr-32) / 9;
        printf("%dt%dn", fahr, celsius);
        fahr = fahr + step;
    }
}

Along with GCC 4.4.7 (GNU/Linux x86-64) I get following disassembly:

$ gcc -O0 -g -ansi -pedantic l1-2a.c
$ gdb -q a.out
(gdb) disas /m main
(gdb) disas /m main
Dump of assembler code for function main:
6   {
   0x00000000004004c4 <+0>: push   %rbp
   0x00000000004004c5 <+1>: mov    %rsp,%rbp
   0x00000000004004c8 <+4>: sub    $0x20,%rsp

7       int fahr, celsius;
8       int lower, upper, step;
9   
10      lower = 0;
   0x00000000004004cc <+8>: movl   $0x0,-0xc(%rbp)

11      upper = 300;
   0x00000000004004d3 <+15>:    movl   $0x12c,-0x8(%rbp)

12      step = 20;
   0x00000000004004da <+22>:    movl   $0x14,-0x4(%rbp)

13  
14      fahr = lower;
   0x00000000004004e1 <+29>:    mov    -0xc(%rbp),%eax
   0x00000000004004e4 <+32>:    mov    %eax,-0x14(%rbp)

15      while (fahr <= upper) {
   0x00000000004004e7 <+35>:    jmp    0x400532 <main+110>
   0x0000000000400532 <+110>:   mov    -0x14(%rbp),%eax
   0x0000000000400535 <+113>:   cmp    -0x8(%rbp),%eax
   0x0000000000400538 <+116>:   jle    0x4004e9 <main+37>

16          celsius = 5 * (fahr-32) / 9;
   0x00000000004004e9 <+37>:    mov    -0x14(%rbp),%edx
   0x00000000004004ec <+40>:    mov    %edx,%eax
   0x00000000004004ee <+42>:    shl    $0x2,%eax
   0x00000000004004f1 <+45>:    add    %edx,%eax
   0x00000000004004f3 <+47>:    lea    -0xa0(%rax),%ecx
   0x00000000004004f9 <+53>:    mov    $0x38e38e39,%edx
   0x00000000004004fe <+58>:    mov    %ecx,%eax
   0x0000000000400500 <+60>:    imul   %edx
   0x0000000000400502 <+62>:    sar    %edx
   0x0000000000400504 <+64>:    mov    %ecx,%eax
   0x0000000000400506 <+66>:    sar    $0x1f,%eax
   0x0000000000400509 <+69>:    mov    %edx,%ecx
   0x000000000040050b <+71>:    sub    %eax,%ecx
   0x000000000040050d <+73>:    mov    %ecx,%eax
   0x000000000040050f <+75>:    mov    %eax,-0x10(%rbp)

17          printf("%dt%dn", fahr, celsius);
   0x0000000000400512 <+78>:    mov    $0x400638,%eax
   0x0000000000400517 <+83>:    mov    -0x10(%rbp),%edx
   0x000000000040051a <+86>:    mov    -0x14(%rbp),%ecx
   0x000000000040051d <+89>:    mov    %ecx,%esi
   0x000000000040051f <+91>:    mov    %rax,%rdi
   0x0000000000400522 <+94>:    mov    $0x0,%eax
   0x0000000000400527 <+99>:    callq  0x4003b8 <printf@plt>

18          fahr = fahr + step;
   0x000000000040052c <+104>:   mov    -0x4(%rbp),%eax
   0x000000000040052f <+107>:   add    %eax,-0x14(%rbp)

19      }
20  }
   0x000000000040053a <+118>:   leaveq 
   0x000000000040053b <+119>:   retq   

End of assembler dump.

What is not clear for me is this fragment:

16          celsius = 5 * (fahr-32) / 9;
   0x00000000004004e9 <+37>:    mov    -0x14(%rbp),%edx
   0x00000000004004ec <+40>:    mov    %edx,%eax
   0x00000000004004ee <+42>:    shl    $0x2,%eax
   0x00000000004004f1 <+45>:    add    %edx,%eax
   0x00000000004004f3 <+47>:    lea    -0xa0(%rax),%ecx
   0x00000000004004f9 <+53>:    mov    $0x38e38e39,%edx
   0x00000000004004fe <+58>:    mov    %ecx,%eax
   0x0000000000400500 <+60>:    imul   %edx
   0x0000000000400502 <+62>:    sar    %edx
   0x0000000000400504 <+64>:    mov    %ecx,%eax
   0x0000000000400506 <+66>:    sar    $0x1f,%eax
   0x0000000000400509 <+69>:    mov    %edx,%ecx
   0x000000000040050b <+71>:    sub    %eax,%ecx
   0x000000000040050d <+73>:    mov    %ecx,%eax
   0x000000000040050f <+75>:    mov    %eax,-0x10(%rbp)

I mean that I understand everything up to:

lea    -0xa0(%rax),%ecx

as it is substracting 160 from %eax register, that holds 5*fahr , as:

5 * (fahr-32) / 9 <=> (5*fahr - 5*32) / 9 <=> (5*fahr - 160) / 9

thus after %ecx (as well as complete %rcx ) stores 5*fahr - 160 . However I don't how it's dividing by 9 then. It seems to be some trickery like "multiply by the inverse" in order to avoid division, but I don't get how it works.


Summing up what was said in the comments: 0x38e38e39 is 954437177 in decimal, which is exactly (2^33 + 1) / 9 . So, the assembly code works this way (I've replaced (5 * fahr - 160) with X for clarity):

mov    $0x38e38e39,%edx /* edx is now 0x38e38e39 == (2^33 + 1) / 9 */
mov    %ecx,%eax        /* eax is now X */
imul   %edx             /* edx:eax is now eax * edx == X * ((2^33 + 1) / 9) */

That's where the fun part starts. edx:eax stands for the 1-operand imul first filling up its operand's ( edx in this case) 32 bits, and then putting the remaining lower bits into eax .

Effectively, we get a 64-bit result across two registers! It looks something like this:

edx is the 32 least significant bits of (X * ((2^33 + 1) / 9)) >> 32 .

eax is (X * ((2^33 + 1) / 9)) % 2^32 (but that's going to be discarded soon)

Then we get this stuff into shape:

sar    %edx             /* edx is now edx >> 1 == (X * ((2^33 + 1) / 9)) >> 33 */
mov    %ecx,%eax        /* eax is now X again */
sar    $0x1f,%eax       /* eax is now X >> 0x1f == X >> 31 */
mov    %edx,%ecx        /* ecx is now (X * ((2^33 + 1) / 9)) >> 33 */

So now ecx is the 32 least significant bits of (X * ((2^33 + 1) / 9)) >> 33 and eax is X >> 31 , ie 32 "sign bit"-s of X (which is a signed 32-bit integer), which are equal to 0 if X is non-negative and to -1 if X is negative.

EDIT: detailed elaboration on the special case of negative X

Now a bit about what happens with negative numbers. The important part about ecx is that it actually is the 32 most significant bits of X * ((2^33 + 1) / 9) .

As I hope you remember, in binary, negating a number means inverting all of its bits and then adding 1 to it. And when we add 1 , we invert the lsb to 1 if it was 0 , otherwise we invert it and all the bits after it 'till we find the first 0 and then invert it too.

So what happens when we try to negate (X * ((2^33 + 1) / 9)) (or, equivalently, what do we get if we perform calculations with -X , considering X positive for this example)? Of course, first we invert all of its bits, then we add 1 to it. But for the latter (adding 1 to it) to affect the 32 most siginficant bits of the number, the 32 least significant bits would have to equal 0xFFFFFFFF . And (trust me on this one) there is no 32-bit integer which, when multiplied by 0x38e38e39 , gives such a result.

So effectively, while (-X * ((2^33 + 1) / 9)) == -(X * ((2^33 + 1) / 9)) , it's different with the 32 most significant bits: ((-X * ((2^33 + 1) / 9)) >> 33) & 0xFFFFFFFF != -(((X * ((2^33 + 1) / 9)) >> 33) & 0xFFFFFFFF) .

Instead, the 32 most significant bits of (-X * ((2^33 + 1) / 9)) equal to the bitwise negation of the 32 most significant bits of (X * ((2^33 + 1) / 9)) : ((-X * ((2^33 + 1) / 9)) >> 33) & 0xFFFFFFFF != ~(((X * ((2^33 + 1) / 9)) >> 33) & 0xFFFFFFFF) .

Tl;dr for negative X case: the value of ecx for -X will be equal to the bitwise negation of the value of ecx for X . We don't want that. So to get correct results for negative values of X , we'll have to add 1 to ecx (or, equivalently, subtract -1 ):

sub    %eax,%ecx        /* ecx is now X / 9 */

Then comes the last part:

mov    %ecx,%eax        /* eax is now X / 9 */
mov    %eax,-0x10(%rbp) /* Aaand mov the result into the variable "cels" */

I'm very sorry if I mixed something up, I have trouble keeping my head around asm written in GAS syntax, but I hope you understand the idea.

Tl;dr: the trick here is multiplying by the inverse multiplied by a large number, discarding the large number with an arithmetic shift, then rounding the result to zero if it's negative.

Why all the bother?

As a result, we stuffed division into 10 cycles (considering imul takes only one cycle too). Considering idiv can take up to almost twice as much cycles (from 11 to 18 as mentioned by Hans Passant in this answer to a similar question), this approach can make a huge performance advantage.

链接地址: http://www.djcxy.com/p/82766.html

上一篇: 为什么我的想法在python2中不起作用?

下一篇: 需要解释K&R fahr的组装说明