GNU gprof 有 bug 吗?

我有一个 C 程序,它通过函数 pi_calcPiBlock 调用函数 pi_calcPiItem() 6 亿次。为了分析各个函数所花费的时间,我使用了 GNU gprof。结果似乎是错误的,因为所有调用都被归到了 main() 名下。此外,调用图也没有任何意义:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total
 time   seconds   seconds    calls  Ts/call  Ts/call  name
 61.29      9.28     9.28                             pi_calcPiItem
 15.85     11.68     2.40                             pi_calcPiBlock
 11.96     13.49     1.81                             _mcount_private
  9.45     14.92     1.43                             __fentry__
  1.45     15.14     0.22                             pow
  0.00     15.14     0.00 600000000     0.00     0.00  main

                        Call graph


granularity: each sample hit covers 4 byte(s) for 0.07% of 15.14 seconds

index % time    self  children    called     name
                                                 <spontaneous>
[1]     61.3    9.28    0.00                 pi_calcPiItem [1]
-----------------------------------------------
                                                 <spontaneous>
[2]     15.9    2.40    0.00                 pi_calcPiBlock [2]
                0.00    0.00 600000000/600000000     main [6]
-----------------------------------------------
                                                 <spontaneous>
[3]     12.0    1.81    0.00                 _mcount_private [3]
-----------------------------------------------
                                                 <spontaneous>
[4]      9.4    1.43    0.00                 __fentry__ [4]
-----------------------------------------------
                                                 <spontaneous>
[5]      1.5    0.22    0.00                 pow [5]
-----------------------------------------------
                                   6             main [6]
                0.00    0.00 600000000/600000000     pi_calcPiBlock [2]
[6]      0.0    0.00    0.00 600000000+6       main [6]
                                   6             main [6]
-----------------------------------------------

这是一个错误,还是我必须以某种方式配置程序?

<spontaneous>是什么意思?

编辑(为你提供更多细节)

代码是关于pi的计算:

/* Number of series terms handled by one pi_calcPiBlock call (main's loop step). */
#define PI_BLOCKSIZE (100000000)
/* Number of blocks main iterates over. */
#define PI_BLOCKCOUNT (6)
/* Total number of terms (600,000,000); upper bound of main's loop index. */
#define PI_THRESHOLD (PI_BLOCKSIZE * PI_BLOCKCOUNT)

/*
 * Program entry point: walks the term range [0, PI_THRESHOLD) in steps of
 * PI_BLOCKSIZE, lets pi_calcPiBlock add each block to the running total,
 * then prints the total.
 *
 * argc and argv are not used. Always returns 0.
 */
int32_t main(int32_t argc, char* argv[]) {
  /* NOTE(review): result is accumulated into by pi_calcPiBlock but is never
   * initialized here; it should start at 0.0. */
  double result;

  for ( int32_t i = 0; i < PI_THRESHOLD; i += PI_BLOCKSIZE ) {
    pi_calcPiBlock(&result, i, i + PI_BLOCKSIZE);
  }

  /* The format string ends with a newline escape, not a literal 'n'. */
  printf("pi = %f\n",result);
  return 0;
}

/*
 * Adds the series terms with indices in [start, end) to *result.
 *
 * result: running total, updated in place.
 * start:  first term index (inclusive).
 * end:    last term index (exclusive); nothing is added when start >= end.
 */
static void pi_calcPiBlock(double* result, int32_t start, int32_t end) {
  double term;
  int32_t index = start;

  while ( index < end ) {
    pi_calcPiItem(&term, index);
    *result += term;
    ++index;
  }
}

static void pi_calcPiItem(double* piItem, int32_t index) {
  *piItem = 4.0 * (pow(-1.0,index) / (2.0 * index + 1.0));
}

这就是我如何得到结果(在Cygwin的帮助下在Windows上执行):

> gcc -std=c99 -o pi *.c -pg -fno-inline-small-functions
> ./pi.exe
> gprof.exe pi.exe

尝试:

  • 使用 noinline、noclone 函数属性代替 -fno-inline-small-functions
  • 通过反汇编 main,我可以看到 -fno-inline-small-functions 并不能阻止内联
  • 静态链接你的程序(-static)
  • 您还应该在 main 中将 result 初始化为 0.0
  • 这在Linux和x86-64上适用于我:

    #include <stdio.h>
    #include <stdint.h>
    #include <math.h>
    
    /* Number of series terms handled by one pi_calcPiBlock call (main's loop step). */
    #define PI_BLOCKSIZE (100000000)
    /* Number of blocks main iterates over. */
    #define PI_BLOCKCOUNT (6)
    /* Total number of terms (600,000,000); upper bound of main's loop index. */
    #define PI_THRESHOLD (PI_BLOCKSIZE * PI_BLOCKCOUNT)
    
    /* Forward declarations: both helpers are defined after main, which calls
     * pi_calcPiBlock, so they must be declared before first use. */
    static void pi_calcPiItem(double* piItem, int32_t index);
    static void pi_calcPiBlock(double* result, int32_t start, int32_t end);
    
    /*
     * Program entry point: walks the term range [0, PI_THRESHOLD) in steps of
     * PI_BLOCKSIZE, lets pi_calcPiBlock add each block to the running total,
     * then prints the total.
     *
     * argc and argv are not used. Always returns 0.
     */
    int32_t main(int32_t argc, char* argv[]) {
      /* Must start at zero: pi_calcPiBlock only ever adds to it. */
      double result = 0.0;

      for ( int32_t i = 0; i < PI_THRESHOLD; i += PI_BLOCKSIZE ) {
        pi_calcPiBlock(&result, i, i + PI_BLOCKSIZE);
      }

      /* The format string ends with a newline escape, not a literal 'n'. */
      printf("pi = %f\n",result);
      return 0;
    }
    
    /*
     * Adds the series terms with indices in [start, end) to *result.
     *
     * result: running total, updated in place.
     * start:  first term index (inclusive).
     * end:    last term index (exclusive); nothing is added when start >= end.
     *
     * The noinline and noclone attributes ask the compiler not to inline or
     * clone this function.
     */
    __attribute__((noinline, noclone))
    static void pi_calcPiBlock(double* result, int32_t start, int32_t end) {
      double term;
      int32_t index = start;

      while ( index < end ) {
        pi_calcPiItem(&term, index);
        *result += term;
        ++index;
      }
    }
    
    __attribute__((noinline, noclone))
    static void pi_calcPiItem(double* piItem, int32_t index) {
      *piItem = 4.0 * (pow(-1.0,index) / (2.0 * index + 1.0));
    }
    

    编译

    $ cc pi.c -o pi -Os -Wall -g3 -I. -std=c99 -pg -static -lm
    

    输出

    $ ./pi && gprof ./pi
    pi = 3.141593
    Flat profile:
    
    Each sample counts as 0.01 seconds.
      %   cumulative   self              self     total           
     time   seconds   seconds    calls  ns/call  ns/call  name    
     85.61     22.55    22.55                             __ieee754_pow_sse2
      4.75     23.80     1.25                             pow
      4.14     24.89     1.09 600000000     1.82     1.82  pi_calcPiItem
      2.54     25.56     0.67                             __exp1
      0.91     25.80     0.24                             pi_calcPiBlock
      0.53     25.94     0.14                             matherr
      0.47     26.07     0.13                             __lseek_nocancel
      0.38     26.17     0.10                             frame_dummy
      0.34     26.26     0.09                             __ieee754_exp_sse2
      0.32     26.34     0.09                             __profile_frequency
      0.00     26.34     0.00        1     0.00     0.00  main
    
    
                 Call graph (explanation follows)
    
    
    granularity: each sample hit covers 2 byte(s) for 0.04% of 26.34 seconds
    
    index % time    self  children    called     name
                                                     <spontaneous>
    [1]     85.6   22.55    0.00                 __ieee754_pow_sse2 [1]
    -----------------------------------------------
                                                     <spontaneous>
    [2]      5.0    0.24    1.09                 pi_calcPiBlock [2]
                    1.09    0.00 600000000/600000000     pi_calcPiItem [4]
    -----------------------------------------------
                                                     <spontaneous>
    [3]      4.7    1.25    0.00                 pow [3]
    -----------------------------------------------
                    1.09    0.00 600000000/600000000     pi_calcPiBlock [2]
    [4]      4.1    1.09    0.00 600000000         pi_calcPiItem [4]
    -----------------------------------------------
                                                     <spontaneous>
    [5]      2.5    0.67    0.00                 __exp1 [5]
    -----------------------------------------------
                                                     <spontaneous>
    [6]      0.5    0.14    0.00                 matherr [6]
    -----------------------------------------------
                                                     <spontaneous>
    [7]      0.5    0.13    0.00                 __lseek_nocancel [7]
    -----------------------------------------------
                                                     <spontaneous>
    [8]      0.4    0.10    0.00                 frame_dummy [8]
    -----------------------------------------------
                                                     <spontaneous>
    [9]      0.3    0.09    0.00                 __ieee754_exp_sse2 [9]
    -----------------------------------------------
                                                     <spontaneous>
    [10]     0.3    0.09    0.00                 __profile_frequency [10]
    -----------------------------------------------
                    0.00    0.00       1/1           __libc_start_main [827]
    [11]     0.0    0.00    0.00       1         main [11]
    -----------------------------------------------
    

    注释

    正如所料,pow() 是瓶颈。在 pi 运行时,perf top(基于采样的系统级分析器)也显示 __ieee754_pow_sse2 占用了 60% 以上的 CPU。按照 @Mike Dunlavey 的建议,将 pow(-1.0,index) 改为 ((i & 1) ? -1.0 : 1.0) 可使代码大约快 4 倍。


    在'man gprof'页面中,这里是对“自发”的解释:

    自身未被分析(profile)的父函数,会得到其被分析的子函数传播上来的时间,但它们在调用图列表中会显示为被自发(spontaneous)调用,并且其时间不会再继续向上传播。同样,信号处理函数即使被分析,也会显示为自发调用(不过原因更为隐晦)。信号处理函数的所有被分析的子函数,其时间都应该能被正确传播,除非该信号处理函数是在分析例程执行期间被调用的,在这种情况下所有信息都会丢失。

    链接地址: http://www.djcxy.com/p/40351.html

    上一篇: Is GNU gprof buggy?

    下一篇: gprof not showing call graph