GNU gprof 有 bug 吗？

2018-06-14 04:10:25

我有一个C程序，它通过函数pi_calcPiBlock调用函数pi_calcPiItem()共6亿次。为了分析各个函数所花费的时间，我使用了GNU gprof。结果似乎是错误的，因为所有调用都被归到了main()名下。此外，调用图也没有任何意义：

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total
 time   seconds   seconds    calls  Ts/call  Ts/call  name
 61.29      9.28     9.28                             pi_calcPiItem
 15.85     11.68     2.40                             pi_calcPiBlock
 11.96     13.49     1.81                             _mcount_private
  9.45     14.92     1.43                             __fentry__
  1.45     15.14     0.22                             pow
  0.00     15.14     0.00 600000000     0.00     0.00  main

                        Call graph


granularity: each sample hit covers 4 byte(s) for 0.07% of 15.14 seconds

index % time    self  children    called     name
                                                 <spontaneous>
[1]     61.3    9.28    0.00                 pi_calcPiItem [1]
-----------------------------------------------
                                                 <spontaneous>
[2]     15.9    2.40    0.00                 pi_calcPiBlock [2]
                0.00    0.00 600000000/600000000     main [6]
-----------------------------------------------
                                                 <spontaneous>
[3]     12.0    1.81    0.00                 _mcount_private [3]
-----------------------------------------------
                                                 <spontaneous>
[4]      9.4    1.43    0.00                 __fentry__ [4]
-----------------------------------------------
                                                 <spontaneous>
[5]      1.5    0.22    0.00                 pow [5]
-----------------------------------------------
                                   6             main [6]
                0.00    0.00 600000000/600000000     pi_calcPiBlock [2]
[6]      0.0    0.00    0.00 600000000+6       main [6]
                                   6             main [6]
-----------------------------------------------

这是一个错误，还是我必须以某种方式配置程序？

而<spontaneous>是什么意思？

编辑（为你们补充更多信息）

代码是关于pi的计算：

/* Number of series terms covered by each pi_calcPiBlock call made from main. */
#define PI_BLOCKSIZE (100000000)
/* Number of blocks main iterates over. */
#define PI_BLOCKCOUNT (6)
/* Total number of terms (600,000,000); main uses it as the exclusive loop bound. */
#define PI_THRESHOLD (PI_BLOCKSIZE * PI_BLOCKCOUNT)

/*
 * Sums the series in PI_BLOCKCOUNT blocks of PI_BLOCKSIZE terms each and
 * prints the accumulated value followed by a newline.
 */
int32_t main(int32_t argc, char* argv[]) {
  /*
   * NOTE(review): result is not initialized in this version of the program.
   * pi_calcPiBlock accumulates into it with `+=`, so the first addition
   * reads an indeterminate value; it should start at 0.0.
   */
  double result;

  /* Each call handles the half-open index range [i, i + PI_BLOCKSIZE). */
  for ( int32_t i = 0; i < PI_THRESHOLD; i += PI_BLOCKSIZE ) {
    pi_calcPiBlock(&result, i, i + PI_BLOCKSIZE);
  }

  printf("pi = %f\n",result);
  return 0;
}

/*
 * Adds the series terms with indices start (inclusive) through end
 * (exclusive) to *result, one term at a time in ascending index order.
 */
static void pi_calcPiBlock(double* result, int32_t start, int32_t end) {
  int32_t index = start;

  while ( index < end ) {
    double term;

    pi_calcPiItem(&term, index);
    *result += term;
    ++index;
  }
}

static void pi_calcPiItem(double* piItem, int32_t index) {
  *piItem = 4.0 * (pow(-1.0,index) / (2.0 * index + 1.0));
}

这就是我如何得到结果（在Cygwin的帮助下在Windows上执行）：

> gcc -std=c99 -o pi *.c -pg -fno-inline-small-functions
> ./pi.exe
> gprof.exe pi.exe

尝试：

使用noinline ， noclone函数属性代替-fno-inline-small-functions

通过反汇编main，我可以看到-fno-inline-small-functions并不能阻止内联

静态链接你的程序（ -static ）

您还应该在main中将result初始化为0.0

下面的代码在Linux x86-64上对我有效：

#include <stdio.h>
#include <stdint.h>
#include <math.h>

/* Number of series terms covered by each pi_calcPiBlock call made from main. */
#define PI_BLOCKSIZE (100000000)
/* Number of blocks main iterates over. */
#define PI_BLOCKCOUNT (6)
/* Total number of terms (600,000,000); main uses it as the exclusive loop bound. */
#define PI_THRESHOLD (PI_BLOCKSIZE * PI_BLOCKCOUNT)

/* Forward declarations: both helpers are defined after main in this file. */
static void pi_calcPiItem(double* piItem, int32_t index);
static void pi_calcPiBlock(double* result, int32_t start, int32_t end);

/*
 * Sums the series in PI_BLOCKCOUNT blocks of PI_BLOCKSIZE terms each and
 * prints the accumulated value followed by a newline.
 */
int32_t main(int32_t argc, char* argv[]) {
  /* The accumulator must start at zero because pi_calcPiBlock only adds to it. */
  double result = 0.0;

  /* Each call handles the half-open index range [i, i + PI_BLOCKSIZE). */
  for ( int32_t i = 0; i < PI_THRESHOLD; i += PI_BLOCKSIZE ) {
    pi_calcPiBlock(&result, i, i + PI_BLOCKSIZE);
  }

  printf("pi = %f\n",result);
  return 0;
}

/*
 * Adds the series terms with indices start (inclusive) through end
 * (exclusive) to *result, one term at a time in ascending index order.
 * The noinline/noclone attributes ask GCC to keep this a separate function.
 */
__attribute__((noinline, noclone))
static void pi_calcPiBlock(double* result, int32_t start, int32_t end) {
  int32_t index = start;

  while ( index < end ) {
    double term;

    pi_calcPiItem(&term, index);
    *result += term;
    ++index;
  }
}

__attribute__((noinline, noclone))
static void pi_calcPiItem(double* piItem, int32_t index) {
  *piItem = 4.0 * (pow(-1.0,index) / (2.0 * index + 1.0));
}

编译

$ cc pi.c -o pi -Os -Wall -g3 -I. -std=c99 -pg -static -lm

输出

$ ./pi && gprof ./pi
pi = 3.141593
Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ns/call  ns/call  name    
 85.61     22.55    22.55                             __ieee754_pow_sse2
  4.75     23.80     1.25                             pow
  4.14     24.89     1.09 600000000     1.82     1.82  pi_calcPiItem
  2.54     25.56     0.67                             __exp1
  0.91     25.80     0.24                             pi_calcPiBlock
  0.53     25.94     0.14                             matherr
  0.47     26.07     0.13                             __lseek_nocancel
  0.38     26.17     0.10                             frame_dummy
  0.34     26.26     0.09                             __ieee754_exp_sse2
  0.32     26.34     0.09                             __profile_frequency
  0.00     26.34     0.00        1     0.00     0.00  main


             Call graph (explanation follows)


granularity: each sample hit covers 2 byte(s) for 0.04% of 26.34 seconds

index % time    self  children    called     name
                                                 <spontaneous>
[1]     85.6   22.55    0.00                 __ieee754_pow_sse2 [1]
-----------------------------------------------
                                                 <spontaneous>
[2]      5.0    0.24    1.09                 pi_calcPiBlock [2]
                1.09    0.00 600000000/600000000     pi_calcPiItem [4]
-----------------------------------------------
                                                 <spontaneous>
[3]      4.7    1.25    0.00                 pow [3]
-----------------------------------------------
                1.09    0.00 600000000/600000000     pi_calcPiBlock [2]
[4]      4.1    1.09    0.00 600000000         pi_calcPiItem [4]
-----------------------------------------------
                                                 <spontaneous>
[5]      2.5    0.67    0.00                 __exp1 [5]
-----------------------------------------------
                                                 <spontaneous>
[6]      0.5    0.14    0.00                 matherr [6]
-----------------------------------------------
                                                 <spontaneous>
[7]      0.5    0.13    0.00                 __lseek_nocancel [7]
-----------------------------------------------
                                                 <spontaneous>
[8]      0.4    0.10    0.00                 frame_dummy [8]
-----------------------------------------------
                                                 <spontaneous>
[9]      0.3    0.09    0.00                 __ieee754_exp_sse2 [9]
-----------------------------------------------
                                                 <spontaneous>
[10]     0.3    0.09    0.00                 __profile_frequency [10]
-----------------------------------------------
                0.00    0.00       1/1           __libc_start_main [827]
[11]     0.0    0.00    0.00       1         main [11]
-----------------------------------------------

注释

正如所料， pow()是瓶颈。在pi运行时， perf top （基于采样的系统分析器）也显示__ieee754_pow_sse2占CPU的60％以上。 @Mike Dunlavey建议将pow(-1.0,index)更改为((i & 1) ? -1.0 : 1.0)使代码大致快4倍。

在'man gprof'页面中，这里是对“自发”的解释：

自身未被分析（profile）的父函数，会得到其被分析的子函数向上传播的时间，但它们在调用图列表中会显示为被“自发”（spontaneous）调用，并且它们的时间不会再继续向上传播。同样，信号捕获函数即使已被分析，也会显示为自发调用（尽管原因更为隐晦）。信号捕获函数的任何被分析的子函数，其时间都应能正确传播，除非该信号捕获函数是在分析例程执行期间被调用的，在这种情况下这些信息将全部丢失。

链接地址: http://www.djcxy.com/p/40351.html

上一篇: Is GNU gprof buggy?

下一篇: gprof not showing call graph