int operators != and == when comparing to zero

I've found that != and == are not the fastest ways to test for zero or non-zero.

    bool nonZero1 = integer != 0;
        xor eax, eax
        test ecx, ecx
        setne al

    bool nonZero2 = integer < 0 || integer > 0;
        test ecx, ecx
        setne al

    bool zero1 = integer == 0;
        xor eax, eax
        test ecx, ecx
        sete al

    bool zero2 = !(integer < 0 || integer > 0);
        test ecx, ecx
        sete al

Compiler: VC++ 11. Optimization flags: /O2 /GL /LTCG.

This is the assembly output for x86-32. The second versions of both comparisons were ~12% faster on both x86-32 and x86-64. However, on x86-64 the generated instructions were identical (the first versions looked exactly like the second ones), yet the second versions were still measured as faster.

  • Why doesn't the compiler generate the faster version on x86-32?
  • Why are the second versions still faster on x86-64 when the assembly output is identical?
  • EDIT: I've added benchmarking code. ZERO: 1544 ms, 1358 ms; NON_ZERO: 1544 ms, 1358 ms. http://pastebin.com/m7ZSUrcP or http://anonymouse.org/cgi-bin/anon-www.cgi/http://pastebin.com/m7ZSUrcP

    Note: It's probably inconvenient to locate these functions when everything is compiled in a single source file, because main.asm gets quite big. I had zero1, zero2, nonZero1 and nonZero2 in a separate source file.

    EDIT2: Could someone with both VC++11 and VC++2010 installed run the benchmarking code and post the timings? It might indeed be a bug in VC++11.
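
    For reference, a minimal sketch of this kind of benchmark harness (illustrative only: the timeIt helper, the iteration count and the single-file layout are assumptions, not necessarily what the pastebin contains) could look like this:

        // Sketch of a benchmark harness. In the setup described above the four
        // functions were defined in a separate source file; they are kept in one
        // file here only to make the sketch self-contained.
        #include <chrono>
        #include <cstdio>

        bool nonZero1(int n) { return n != 0; }
        bool nonZero2(int n) { return n < 0 || n > 0; }
        bool zero1(int n)    { return n == 0; }
        bool zero2(int n)    { return !(n < 0 || n > 0); }

        template <typename F>
        static long long timeIt(const char* name, F f)
        {
            using namespace std::chrono;
            volatile bool sink = false;               // keep every result observable
            const auto start = steady_clock::now();
            for (int i = -100000000; i < 100000000; ++i)
                sink = f(i);
            const long long ms =
                duration_cast<milliseconds>(steady_clock::now() - start).count();
            std::printf("%s: %lld ms (sink=%d)\n", name, ms, int(sink));
            return ms;
        }

        int main()
        {
            timeIt("nonZero1", nonZero1);
            timeIt("nonZero2", nonZero2);
            timeIt("zero1",    zero1);
            timeIt("zero2",    zero2);
        }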


    EDIT: Saw the OP's assembly listing for my code. I doubt this is even a general bug in VS2011 now; it may simply be a special-case bug triggered by the OP's code. I ran the OP's code as-is with clang 3.2, gcc 4.6.2 and VS2010, and in all cases the maximum differences were ~1%.

    I just compiled the sources, with suitable modifications, as my ne.c file with the /O2 and /GL flags. Here's the source:

    #include <stdlib.h>   /* for rand() */
    int ne1(int n) {
        return n != 0;
    }

    int ne2(int n) {
        return n < 0 || n > 0;
    }

    int ne3(int n) {
        return !(n == 0);
    }

    int main() { int p = ne1(rand()), q = ne2(rand()), r = ne3(rand()); }
    

    and the corresponding assembly:

        ; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.30319.01 
    
    TITLE   D:\llvm_workspace\tests\ne.c
        .686P
        .XMM
        include listing.inc
        .model  flat
    
    INCLUDELIB OLDNAMES
    
    EXTRN   @__security_check_cookie@4:PROC
    EXTRN   _rand:PROC
    PUBLIC  _ne3
    ; Function compile flags: /Ogtpy
    ;   COMDAT _ne3
    _TEXT   SEGMENT
    _n$ = 8                         ; size = 4
    _ne3    PROC                        ; COMDAT
    ; File d:\llvm_workspace\tests\ne.c
    ; Line 11
        xor eax, eax
        cmp DWORD PTR _n$[esp-4], eax
        setne   al
    ; Line 12
        ret 0
    _ne3    ENDP
    _TEXT   ENDS
    PUBLIC  _ne2
    ; Function compile flags: /Ogtpy
    ;   COMDAT _ne2
    _TEXT   SEGMENT
    _n$ = 8                         ; size = 4
    _ne2    PROC                        ; COMDAT
    ; Line 7
        xor eax, eax
        cmp eax, DWORD PTR _n$[esp-4]
        sbb eax, eax
        neg eax
    ; Line 8
        ret 0
    _ne2    ENDP
    _TEXT   ENDS
    PUBLIC  _ne1
    ; Function compile flags: /Ogtpy
    ;   COMDAT _ne1
    _TEXT   SEGMENT
    _n$ = 8                         ; size = 4
    _ne1    PROC                        ; COMDAT
    ; Line 3
        xor eax, eax
        cmp DWORD PTR _n$[esp-4], eax
        setne   al
    ; Line 4
        ret 0
    _ne1    ENDP
    _TEXT   ENDS
    PUBLIC  _main
    ; Function compile flags: /Ogtpy
    ;   COMDAT _main
    _TEXT   SEGMENT
    _main   PROC                        ; COMDAT
    ; Line 14
        call    _rand
        call    _rand
        call    _rand
        xor eax, eax
        ret 0
    _main   ENDP
    _TEXT   ENDS
    END
    

    ne2(), which uses the <, > and || operators, is clearly more expensive. ne1() and ne3(), which use the != and == operators respectively, are terser and equivalent.

    Visual Studio 2011 is in beta. I would consider this a bug. My tests with two other compilers, namely gcc 4.6.2 and clang 3.2, with the -O2 optimization switch yielded essentially the same, equally terse assembly for all three tests (that I had) on my Windows 7 box. Here's a summary:

    $ cat ne.c
    
    #include <stdbool.h>
    bool ne1(int n) {
        return n != 0;
    }
    
    bool ne2(int n) {
        return n < 0 || n > 0;
    }
    
    bool ne3(int n) {
        return !(n != 0);
    }
    
    int main() {}
    

    yields with gcc:

    _ne1:
    LFB0:
        .cfi_startproc
        movl    4(%esp), %eax
        testl   %eax, %eax
        setne   %al
        ret
        .cfi_endproc
    LFE0:
        .p2align 2,,3
        .globl  _ne2
        .def    _ne2;   .scl    2;  .type   32; .endef
    _ne2:
    LFB1:
        .cfi_startproc
        movl    4(%esp), %edx
        testl   %edx, %edx
        setne   %al
        ret
        .cfi_endproc
    LFE1:
        .p2align 2,,3
        .globl  _ne3
        .def    _ne3;   .scl    2;  .type   32; .endef
    _ne3:
    LFB2:
        .cfi_startproc
        movl    4(%esp), %ecx
        testl   %ecx, %ecx
        sete    %al
        ret
        .cfi_endproc
    LFE2:
        .def    ___main;    .scl    2;  .type   32; .endef
        .section    .text.startup,"x"
        .p2align 2,,3
        .globl  _main
        .def    _main;  .scl    2;  .type   32; .endef
    _main:
    LFB3:
        .cfi_startproc
        pushl   %ebp
        .cfi_def_cfa_offset 8
        .cfi_offset 5, -8
        movl    %esp, %ebp
        .cfi_def_cfa_register 5
        andl    $-16, %esp
        call    ___main
        xorl    %eax, %eax
        leave
        .cfi_restore 5
        .cfi_def_cfa 4, 4
        ret
        .cfi_endproc
    LFE3:
    

    and with clang:

        .def     _ne1;
        .scl    2;
        .type   32;
        .endef
        .text
        .globl  _ne1
        .align  16, 0x90
    _ne1:
        cmpl    $0, 4(%esp)
        setne   %al
        movzbl  %al, %eax
        ret
    
        .def     _ne2;
        .scl    2;
        .type   32;
        .endef
        .globl  _ne2
        .align  16, 0x90
    _ne2:
        cmpl    $0, 4(%esp)
        setne   %al
        movzbl  %al, %eax
        ret
    
        .def     _ne3;
        .scl    2;
        .type   32;
        .endef
        .globl  _ne3
        .align  16, 0x90
    _ne3:
        cmpl    $0, 4(%esp)
        sete    %al
        movzbl  %al, %eax
        ret
    
        .def     _main;
        .scl    2;
        .type   32;
        .endef
        .globl  _main
        .align  16, 0x90
    _main:
        pushl   %ebp
        movl    %esp, %ebp
        calll   ___main
        xorl    %eax, %eax
        popl    %ebp
        ret
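
    For reference, assembly summaries like the two above can be produced with the compilers' standard -S switch (the output file names here are arbitrary):

        $ gcc -O2 -S ne.c -o ne_gcc.s
        $ clang -O2 -S ne.c -o ne_clang.s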
    

    My suggestion would be to file this as a bug with Microsoft Connect.

    Note: I compiled them as C source since I don't think using the corresponding C++ compiler would make any significant change here.


    This is a great question, but I think you've fallen victim to the compiler's dependency analysis.

    The compiler only has to clear the upper bits of eax once, and they remain cleared for the second version. The second version would have to pay the price of the xor eax, eax, except that the compiler's analysis proved that the register was left cleared by the first version.

    The second version is able to "cheat" by taking advantage of work the compiler did in the first version.

    How are you measuring times? Is it "(version one, followed by version two) in a loop", or "(version one in a loop) followed by (version two in a loop)"?

    Don't do both tests in the same program (instead, recompile for each version), or if you do, test both "version A first" and "version B first" and see whether whichever comes first is paying a penalty.


    Illustration of the cheating:

    timer1.start();
    double x1 = 2 * sqrt(n + 37 * y + exp(z));
    timer1.stop();
    timer2.start();
    double x2 = 31 * sqrt(n + 37 * y + exp(z));
    timer2.stop();
    

    If the timer2 duration is less than the timer1 duration, we don't conclude that multiplying by 31 is faster than multiplying by 2. Instead, we realize that the compiler performed common subexpression elimination, and the code became:

    timer1.start();
    double common = sqrt(n + 37 * y + exp(z));
    double x1 = 2 * common;
    timer1.stop();
    timer2.start();
    double x2 = 31 * common;
    timer2.stop();
    

    And the only thing this proves is that multiplying by 31 is faster than computing common, which is hardly surprising at all -- multiplication is far, far faster than sqrt and exp.
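
    Putting the ordering advice above into practice: a quick check is to time both orders in the same run and see whether whichever snippet goes second is consistently the "faster" one. Here is a minimal sketch (the timeMs helper and the iteration count are illustrative assumptions, not code from the question):

        // If whichever version runs second is consistently faster, the two
        // measurements are sharing work (warm caches, reused register state,
        // common subexpressions) rather than showing a real difference.
        #include <chrono>
        #include <cstdio>

        static bool nonZero1(int n) { return n != 0; }
        static bool nonZero2(int n) { return n < 0 || n > 0; }

        template <typename F>
        static long long timeMs(F f)
        {
            using namespace std::chrono;
            volatile bool sink = false;    // force the result to be stored each iteration
            const auto start = steady_clock::now();
            for (int i = -100000000; i < 100000000; ++i)
                sink = f(i);
            return duration_cast<milliseconds>(steady_clock::now() - start).count();
        }

        int main()
        {
            long long a1 = timeMs(nonZero1);   // order 1: nonZero1 first
            long long b1 = timeMs(nonZero2);
            long long b2 = timeMs(nonZero2);   // order 2: nonZero2 first
            long long a2 = timeMs(nonZero1);
            std::printf("nonZero1 first: %lld ms then %lld ms\n", a1, b1);
            std::printf("nonZero2 first: %lld ms then %lld ms\n", b2, a2);
        }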
