> vs. >= causes significant performance difference

I just stumbled upon something. At first I thought it might be another case of branch misprediction, but I cannot explain why branch misprediction would cause this particular phenomenon. I implemented two versions of bubble sort in Java and ran some performance tests:

import java.util.Random;

public class BubbleSortAnnomaly {

    public static void main(String... args) {
        final int ARRAY_SIZE = Integer.parseInt(args[0]);
        final int LIMIT = Integer.parseInt(args[1]);
        final int RUNS = Integer.parseInt(args[2]);

        int[] a = new int[ARRAY_SIZE];
        int[] b = new int[ARRAY_SIZE];
        Random r = new Random();
        for (int run = 0; RUNS > run; ++run) {
            for (int i = 0; i < ARRAY_SIZE; i++) {
                a[i] = r.nextInt(LIMIT);
                b[i] = a[i];
            }

            System.out.print("Sorting with sortA: ");
            long start = System.nanoTime();
            int swaps = bubbleSortA(a);

            System.out.println(  ((System.nanoTime() - start) / 1E9) + " seconds. "
                               + "It used " + swaps + " swaps.");

            System.out.print("Sorting with sortB: ");
            start = System.nanoTime();
            swaps = bubbleSortB(b);

            System.out.println(  ((System.nanoTime() - start) / 1E9) + " seconds. "
                               + "It used " + swaps + " swaps.");
        }
    }

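    // Variant A: swap only when strictly greater, i.e. a[j] > a[j + 1].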
    public static int bubbleSortA(int[] a) {
        int counter = 0;
        for (int i = a.length - 1; i >= 0; --i) {
            for (int j = 0; j < i; ++j) {
                if (a[j] > a[j + 1]) {
                    swap(a, j, j + 1);
                    ++counter;
                }
            }
        }
        return (counter);
    }

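    // Variant B: identical except that equal neighbours are swapped too, i.e. a[j] >= a[j + 1].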
    public static int bubbleSortB(int[] a) {
        int counter = 0;
        for (int i = a.length - 1; i >= 0; --i) {
            for (int j = 0; j < i; ++j) {
                if (a[j] >= a[j + 1]) {
                    swap(a, j, j + 1);
                    ++counter;
                }
            }
        }
        return (counter);
    }

    private static void swap(int[] a, int j, int i) {
        int h = a[i];
        a[i] = a[j];
        a[j] = h;
    }
}

As you can see, the only difference between the two sorting methods is > vs. >=. When running the program with java BubbleSortAnnomaly 50000 10 10, you would obviously expect sortB to be slower than sortA, because it has to perform additional swaps of equal elements. But I got the following (or similar) output on three different machines:

Sorting with sortA: 4.214 seconds. It used  564960211 swaps.
Sorting with sortB: 2.278 seconds. It used 1249750569 swaps.
Sorting with sortA: 4.199 seconds. It used  563355818 swaps.
Sorting with sortB: 2.254 seconds. It used 1249750348 swaps.
Sorting with sortA: 4.189 seconds. It used  560825110 swaps.
Sorting with sortB: 2.264 seconds. It used 1249749572 swaps.
Sorting with sortA: 4.17  seconds. It used  561924561 swaps.
Sorting with sortB: 2.256 seconds. It used 1249749766 swaps.
Sorting with sortA: 4.198 seconds. It used  562613693 swaps.
Sorting with sortB: 2.266 seconds. It used 1249749880 swaps.
Sorting with sortA: 4.19  seconds. It used  561658723 swaps.
Sorting with sortB: 2.281 seconds. It used 1249751070 swaps.
Sorting with sortA: 4.193 seconds. It used  564986461 swaps.
Sorting with sortB: 2.266 seconds. It used 1249749681 swaps.
Sorting with sortA: 4.203 seconds. It used  562526980 swaps.
Sorting with sortB: 2.27  seconds. It used 1249749609 swaps.
Sorting with sortA: 4.176 seconds. It used  561070571 swaps.
Sorting with sortB: 2.241 seconds. It used 1249749831 swaps.
Sorting with sortA: 4.191 seconds. It used  559883210 swaps.
Sorting with sortB: 2.257 seconds. It used 1249749371 swaps.

When you set the parameter LIMIT to e.g. 50000 (java BubbleSortAnnomaly 50000 50000 10), you get the expected results:

Sorting with sortA: 3.983 seconds. It used  625941897 swaps.
Sorting with sortB: 4.658 seconds. It used  789391382 swaps.

I ported the program to C++ to determine whether this problem is Java-specific. Here is the C++ code.

#include <cstdlib>
#include <iostream>

#include <omp.h>

#ifndef ARRAY_SIZE
#define ARRAY_SIZE 50000
#endif

#ifndef LIMIT
#define LIMIT 10
#endif

#ifndef RUNS
#define RUNS 10
#endif

void swap(int * a, int i, int j)
{
    int h = a[i];
    a[i] = a[j];
    a[j] = h;
}

int bubbleSortA(int * a)
{
    const int LAST = ARRAY_SIZE - 1;
    int counter = 0;
    for (int i = LAST; 0 < i; --i)
    {
        for (int j = 0; j < i; ++j)
        {
            int next = j + 1;
            if (a[j] > a[next])
            {
                swap(a, j, next);
                ++counter;
            }
        }
    }
    return (counter);
}

int bubbleSortB(int * a)
{
    const int LAST = ARRAY_SIZE - 1;
    int counter = 0;
    for (int i = LAST; 0 < i; --i)
    {
        for (int j = 0; j < i; ++j)
        {
            int next = j + 1;
            if (a[j] >= a[next])
            {
                swap(a, j, next);
                ++counter;
            }
        }
    }
    return (counter);
}

int main()
{
    int * a = (int *) malloc(ARRAY_SIZE * sizeof(int));
    int * b = (int *) malloc(ARRAY_SIZE * sizeof(int));
    for (int run = 0; RUNS > run; ++run)
    {
        for (int idx = 0; ARRAY_SIZE > idx; ++idx)
        {
            a[idx] = std::rand() % LIMIT;
            b[idx] = a[idx];
        }

        std::cout << "Sorting with sortA: ";
        double start = omp_get_wtime();
        int swaps = bubbleSortA(a);

        std::cout << (omp_get_wtime() - start) << " seconds. It used " << swaps
                  << " swaps." << std::endl;

        std::cout << "Sorting with sortB: ";
        start = omp_get_wtime();
        swaps = bubbleSortB(b);

        std::cout << (omp_get_wtime() - start) << " seconds. It used " << swaps
                  << " swaps." << std::endl;
    }

    free(a);
    free(b);

    return (0);
}

This program shows the same behaviour. Can someone explain what exactly is going on here?

Executing sortB first and then sortA does not change the results.


I think it may indeed be due to branch prediction. If you count the number of swaps compared to the number of inner sort iterations, you find:

Limit = 10

  • A = 560M swaps / 1250M loops
  • B = 1250M swaps / 1250M loops (0.02% fewer swaps than loops)

Limit = 50000

  • A = 627M swaps / 1250M loops
  • B = 850M swaps / 1250M loops

So in the Limit == 10 case the swap is performed 99.98% of the time in the B sort, which is obviously favourable for the branch predictor. In the Limit == 50000 case the swap is only taken about 68% of the time (essentially at random), so the branch predictor is less beneficial.
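
If you want to reproduce these ratios yourself, here is a minimal instrumented sketch (my own code, not part of the original program; the class name SwapRatio and its helpers are made up for illustration) that runs both comparisons on copies of the same data and reports swaps versus inner-loop iterations:

import java.util.Random;

public class SwapRatio {

    public static void main(String... args) {
        final int ARRAY_SIZE = 50000;
        final int LIMIT = 10;   // set to 50000 to reproduce the second scenario
        Random r = new Random();

        int[] a = new int[ARRAY_SIZE];
        int[] b = new int[ARRAY_SIZE];
        for (int i = 0; i < ARRAY_SIZE; i++) {
            a[i] = r.nextInt(LIMIT);
            b[i] = a[i];
        }

        report("> ", sortAndCount(a, true));
        report(">=", sortAndCount(b, false));
    }

    // Bubble sort as in the question; returns {inner-loop iterations, swaps}.
    // strict == true uses ">", strict == false uses ">=".
    private static long[] sortAndCount(int[] a, boolean strict) {
        long loops = 0, swaps = 0;
        for (int i = a.length - 1; i >= 0; --i) {
            for (int j = 0; j < i; ++j) {
                ++loops;
                boolean doSwap = strict ? a[j] > a[j + 1] : a[j] >= a[j + 1];
                if (doSwap) {
                    int h = a[j];
                    a[j] = a[j + 1];
                    a[j + 1] = h;
                    ++swaps;
                }
            }
        }
        return new long[] { loops, swaps };
    }

    private static void report(String cmp, long[] c) {
        System.out.printf("%s : %d swaps / %d inner iterations (branch taken %.2f%% of the time)%n",
                          cmp, c[1], c[0], 100.0 * c[1] / c[0]);
    }
}

With LIMIT = 10 the >= variant takes the swap branch on nearly every inner-loop iteration, which is exactly the almost-always-taken pattern a branch predictor handles best.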


    I think this can indeed be explained by branch misprediction.

    Consider, for example, LIMIT=11 and sortB. On the first iteration of the outer loop it will very quickly stumble upon one of the elements equal to 10. So it will have a[j]=10, and therefore a[j] will definitely be >= a[next], since there are no elements greater than 10. It will therefore perform a swap, then advance j by one step only to find a[j]=10 once again (the same value, just swapped forward). So once again a[j] >= a[next] holds, and so on. Every comparison except a few at the very beginning will be true. The same happens on the subsequent iterations of the outer loop.

    Not so for sortA. It will start roughly the same way, stumble upon a[j]=10, and do some swaps in a similar manner, but only up to the point where it finds a[next]=10 as well. Then the condition is false, and no swap is done. And so on: every time it stumbles upon a[next]=10, the condition is false and no swap is done. Therefore, this condition is true 10 times out of 11 (for values of a[next] from 0 to 9) and false in 1 case out of 11. Nothing strange that branch prediction fails.
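
    To see this pattern directly, here is a small toy sketch (my own illustration, not from the original post; the class name BranchTrace and the fixed seed are arbitrary) that traces every comparison in the first outer-loop pass of each variant over the same array of values in 0..10, printing 'T' where the swap branch is taken and 'F' where it is not:

    import java.util.Arrays;
    import java.util.Random;

    public class BranchTrace {

        public static void main(String... args) {
            Random r = new Random(42);            // fixed seed so the trace is repeatable
            int[] a = new int[40];
            for (int i = 0; i < a.length; i++) {
                a[i] = r.nextInt(11);             // LIMIT = 11, values 0..10
            }

            System.out.println("values: " + Arrays.toString(a));
            System.out.println("sortA : " + tracePass(a.clone(), true));   // a[j] >  a[j+1]
            System.out.println("sortB : " + tracePass(a.clone(), false));  // a[j] >= a[j+1]
        }

        // One pass of the inner loop, recording whether the swap branch was taken.
        private static String tracePass(int[] a, boolean strict) {
            StringBuilder outcome = new StringBuilder();
            for (int j = 0; j < a.length - 1; ++j) {
                boolean swap = strict ? a[j] > a[j + 1] : a[j] >= a[j + 1];
                outcome.append(swap ? 'T' : 'F');
                if (swap) {                        // same swap as in the question's code
                    int h = a[j];
                    a[j] = a[j + 1];
                    a[j + 1] = h;
                }
            }
            return outcome.toString();
        }
    }

    Once a 10 has been picked up and carried to the right, the >= trace becomes a solid run of 'T's, while the > trace shows an 'F' roughly once per 11 comparisons, whenever another 10 is encountered.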


    Using the C++ code provided (with the time measurement removed) together with the perf stat command, I got results that confirm the branch-miss theory.

    With Limit = 10, BubbleSortB benefits greatly from branch prediction (0.01% misses), but with Limit = 50000 branch prediction fails even more often (15.65% misses) than in BubbleSortA (12.69% and 12.76% misses, respectively).

    BubbleSortA Limit=10:

    Performance counter stats for './bubbleA.out':
    
       46670.947364 task-clock                #    0.998 CPUs utilized          
                 73 context-switches          #    0.000 M/sec                  
                 28 CPU-migrations            #    0.000 M/sec                  
                379 page-faults               #    0.000 M/sec                  
    117,298,787,242 cycles                    #    2.513 GHz                    
    117,471,719,598 instructions              #    1.00  insns per cycle        
     25,104,504,912 branches                  #  537.904 M/sec                  
      3,185,376,029 branch-misses             #   12.69% of all branches        
    
       46.779031563 seconds time elapsed
    

    BubbleSortA Limit=50000:

    Performance counter stats for './bubbleA.out':
    
       46023.785539 task-clock                #    0.998 CPUs utilized          
                 59 context-switches          #    0.000 M/sec                  
                  8 CPU-migrations            #    0.000 M/sec                  
                379 page-faults               #    0.000 M/sec                  
    118,261,821,200 cycles                    #    2.570 GHz                    
    119,230,362,230 instructions              #    1.01  insns per cycle        
     25,089,204,844 branches                  #  545.136 M/sec                  
      3,200,514,556 branch-misses             #   12.76% of all branches        
    
       46.126274884 seconds time elapsed
    

    BubbleSortB Limit=10:

    Performance counter stats for './bubbleB.out':
    
       26091.323705 task-clock                #    0.998 CPUs utilized          
                 28 context-switches          #    0.000 M/sec                  
                  2 CPU-migrations            #    0.000 M/sec                  
                379 page-faults               #    0.000 M/sec                  
     64,822,368,062 cycles                    #    2.484 GHz                    
    137,780,774,165 instructions              #    2.13  insns per cycle        
     25,052,329,633 branches                  #  960.179 M/sec                  
          3,019,138 branch-misses             #    0.01% of all branches        
    
       26.149447493 seconds time elapsed
    

    BubbleSortB Limit=50000:

    Performance counter stats for './bubbleB.out':
    
       51644.210268 task-clock                #    0.983 CPUs utilized          
              2,138 context-switches          #    0.000 M/sec                  
                 69 CPU-migrations            #    0.000 M/sec                  
                378 page-faults               #    0.000 M/sec                  
    144,600,738,759 cycles                    #    2.800 GHz                    
    124,273,104,207 instructions              #    0.86  insns per cycle        
     25,104,320,436 branches                  #  486.101 M/sec                  
      3,929,572,460 branch-misses             #   15.65% of all branches        
    
       52.511233236 seconds time elapsed
    