Why is C++ so much faster than Python with Boost?
My goal is to write a small library for spectral finite elements in Python. To that end, I tried extending a C++ library with Boost, hoping this would make my code faster.
class Quad {
    public:
        Quad(int, int);
        double integrate(boost::function<double(std::vector<double> const&)> const&);
        double integrate_wrapper(boost::python::object const&);
        std::vector< std::vector<double> > nodes;
        std::vector<double> weights;
};
...
namespace std {
    typedef std::vector< std::vector< std::vector<double> > > cube;
    typedef std::vector< std::vector<double> > mat;
    typedef std::vector<double> vec;
}
...
double Quad::integrate(boost::function<double(vec const&)> const& func) {
    double result = 0.;
    for (unsigned int i = 0; i < nodes.size(); ++i) {
        result += func(nodes[i]) * weights[i];
    }
    return result;
}
// ---- PYTHON WRAPPER ----
double Quad::integrate_wrapper(boost::python::object const& func) {
    std::function<double(vec const&)> lambda;
    switch (this->nodes[0].size()) {
        case 1: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func (v[0])); }; break;
        case 2: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func(v[0], v[1])); }; break;
        case 3: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func(v[0], v[1], v[2])); }; break;
        default: cout << "Dimension must be 1, 2, or 3" << endl; exit(0);
    }
    return integrate(lambda);
}
// ---- EXPOSE TO PYTHON ----
BOOST_PYTHON_MODULE(hermite)
{
    using namespace boost::python;
    class_<std::vec>("double_vector")
        .def(vector_indexing_suite<std::vec>())
        ;
    class_<std::mat>("double_mat")
        .def(vector_indexing_suite<std::mat>())
        ;
    class_<Quad>("Quad", init<int,int>())
        .def("integrate", &Quad::integrate_wrapper)
        .def_readonly("nodes", &Quad::nodes)
        .def_readonly("weights", &Quad::weights)
        ;
}
I compared the performance of three different methods for computing the integral of two functions. The two functions are:

f1(x, y, z) = x*x

f2(x, y, z) = np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) + np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z)

The methods used are:
Calling the library from a C++ program:
double func(vector<double> v) {
    return F1_OR_F2;
}
int main() {
    hermite::Quad quadrature(100, 3);
    double result = quadrature.integrate(func);
    cout << "Result = " << result << endl;
}
Calling the library from a Python script:
import hermite
def function(x, y, z): return F1_OR_F2
my_quad = hermite.Quad(100, 3)
result = my_quad.integrate(function)
Using a for loop in Python:
import hermite
def function(x, y, z): return F1_OR_F2
my_quad = hermite.Quad(100, 3)
weights = my_quad.weights
nodes = my_quad.nodes
result = 0.
for i in range(len(weights)):
    result += weights[i] * function(nodes[i][0], nodes[i][1], nodes[i][2])
Here are the execution times for each method (for method 1, the time was measured with the time command; for methods 2 and 3, with the Python time module; the C++ code was compiled with CMake and set(CMAKE_BUILD_TYPE Release)).

For f1:
0.07s user 0.01s system 99% cpu 0.083 total
For f2:
0.28s user 0.01s system 99% cpu 0.289 total

Based on these results, my questions are the following:
Why is the first method so much faster than the second?
Can the Python wrapper be improved to reach performance comparable between methods 1 and 2?
Why is method 2 more sensitive than method 3 to the difficulty of the function being integrated?
Edit: I also tried defining a function that takes a string as an argument, writes it to a file, then compiles that file and dynamically loads the resulting .so file:
double Quad::integrate_from_string(string const& function_body) {
    // Write function to file
    ofstream helper_file;
    helper_file.open("/tmp/helper_function.cpp");
    helper_file << "#include <vector>\n#include <cmath>\n";
    helper_file << "extern \"C\" double toIntegrate(std::vector<double> v) {\n";
    helper_file << "    return " << function_body << ";\n}";
    helper_file.close();
    // Compile file
    system("c++ /tmp/helper_function.cpp -o /tmp/helper_function.so -shared -fPIC");
    // Load function dynamically
    typedef double (*vec_func)(vec);
    void *function_so = dlopen("/tmp/helper_function.so", RTLD_NOW);
    vec_func func = (vec_func) dlsym(function_so, "toIntegrate");
    double result = integrate(func);
    dlclose(function_so);
    return result;
}
It is quite dirty and probably not very portable, so I would be happy to find a better solution, but it works well and plays nicely with the ccode function of sympy.
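As a minimal sketch of how this could be driven from Python with sympy (not code from the question: it assumes integrate_from_string is also exposed in BOOST_PYTHON_MODULE, and the symbol names are chosen only to match the v[0], v[1], v[2] arguments of the generated toIntegrate function):

import sympy as sym
import hermite  # the Boost.Python module defined above

# Build the integrand symbolically, then let sympy.ccode emit a C expression
# string that integrate_from_string can paste into the helper file.
x, y, z = sym.Symbol('v[0]'), sym.Symbol('v[1]'), sym.Symbol('v[2]')
expr = sym.cos(2*x + 2*y + 2*z) + x*y + sym.exp(-z*z)
body = sym.ccode(expr)  # a plain C expression in terms of v[0], v[1], v[2]

quad = hermite.Quad(100, 3)
result = quad.integrate_from_string(body)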
Second edit: I have rewritten the function in pure Python using Numpy.
import numpy as np
import numpy.polynomial.hermite_e as herm
import time
def integrate(function, degrees):
    dim = len(degrees)
    nodes_multidim = []
    weights_multidim = []
    for i in range(dim):
        nodes_1d, weights_1d = herm.hermegauss(degrees[i])
        nodes_multidim.append(nodes_1d)
        weights_multidim.append(weights_1d)
    grid_nodes = np.meshgrid(*nodes_multidim)
    grid_weights = np.meshgrid(*weights_multidim)
    nodes_flattened = []
    weights_flattened = []
    for i in range(dim):
        nodes_flattened.append(grid_nodes[i].flatten())
        weights_flattened.append(grid_weights[i].flatten())
    nodes = np.vstack(nodes_flattened)
    weights = np.prod(np.vstack(weights_flattened), axis=0)
    return np.dot(function(nodes), weights)
def function(v): return F1_OR_F2

start = time.time()
result = integrate(function, [100, 100, 100])
end = time.time()
print("-> Result = " + str(result) + ", Time = " + str(end - start))
Somewhat surprisingly (at least to me), there is no significant difference in performance between this method and the pure C++ implementation. In particular, f1 takes 0.059s and f2 takes 0.36s.
Another approach
Stated a bit less generally, your problem can be solved much more easily. You could write both the integration and the function in pure Python code and compile it with numba.
First approach (0.025s per integration after the first run, on an i7-4771)
The function is compiled on the first call; this takes about 0.5s.
function_2:
@nb.njit(fastmath=True)
def function_to_integrate(x,y,z):
  return np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z)
The integration:
@nb.jit(fastmath=True)
def integrate3(num_int_Points):
  nodes_1d, weights_1d = herm.hermegauss(num_int_Points)
  result=0.
  for i in range(num_int_Points):
    for j in range(num_int_Points):
      result+=np.sum(function_to_integrate(nodes_1d[i],nodes_1d[j],nodes_1d[:])*weights_1d[i]*weights_1d[j]*weights_1d[:])
  return result
Testing:
import numpy as np
import numpy.polynomial.hermite_e as herm
import numba as nb
import time
num_int_Points = 100
t1=time.time()
nodes_1d, weights_1d = herm.hermegauss(num_int_Points)
for i in range(100):
  #result = integrate3(nodes_1d,weights_1d,100)
  result = integrate3(100) 
print(time.time()-t1)
print(result)
Second approach
The function can also be run in parallel: when integrating over many elements, the Gauss points and weights need only be computed once. This results in a runtime of about 0.005 seconds.
@nb.njit(fastmath=True,parallel=True)
def integrate3(nodes_1d,weights_1d,num_int_Points):
  result=0.
  for i in nb.prange(num_int_Points):
    for j in range(num_int_Points):
      result+=np.sum(function_to_integrate(nodes_1d[i],nodes_1d[j],nodes_1d[:])*weights_1d[i]*weights_1d[j]*weights_1d[:])
  return result
Passing arbitrary functions
import numpy as np
import numpy.polynomial.hermite_e as herm
import numba as nb
import time
def f(x,y,z):
  return np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z) +np.cos(2*x+2*y+2*z) + x*y + np.exp(-z*z)
def make_integrate3(f):
  f_jit=nb.njit(f,fastmath=True)
  @nb.njit(fastmath=True,parallel=True)
  def integrate_3(nodes_1d,weights_1d,num_int_Points):
      result=0.
      for i in nb.prange(num_int_Points):
        for j in range(num_int_Points):
          result+=np.sum(f_jit(nodes_1d[i],nodes_1d[j],nodes_1d[:])*weights_1d[i]*weights_1d[j]*weights_1d[:])
      return result
  return integrate_3
int_fun=make_integrate3(f)
num_int_Points=100
nodes_1d, weights_1d = herm.hermegauss(num_int_Points)
#Calling it the first time (takes about 1s)
result = int_fun(nodes_1d,weights_1d,100)
t1=time.time()
for i in range(100):
  result = int_fun(nodes_1d,weights_1d,100)
print(time.time()-t1)
print(result)
After the first call, this takes about 0.002 seconds using Numba 0.38 with Intel SVML.

Your functions take the vector by value, which involves copying it. integrate_wrapper does extra copies on top of that.

It also makes sense to accept the boost::function by const reference and to capture func by reference in those lambdas.

Change them as follows (note the & and const& bits):
double integrate(boost::function<double(std::vector<double> const&)> const&);
double Quad::integrate_wrapper(boost::python::object func) {
    std::function<double(vec const&)> lambda;
    switch (this->nodes[0].size()) {
        case 1: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func (v[0])); }; break;
        case 2: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func(v[0], v[1])); }; break;
        case 3: lambda = [&func](vec const& v) -> double { return boost::python::extract<double>(func(v[0], v[1], v[2])); }; break;
        default: cout << "Dimension must be 1, 2, or 3" << endl; exit(0);
    }
    return integrate(lambda);
}
That said, calling Python functions from C++ is still more expensive than calling C++ functions.
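To get a feel for the scale of that overhead, here is a minimal sketch (not the benchmark above) that times a plain Python-level call with timeit; the boost::python path additionally pays for building the argument tuple, boxing the floats, and extracting the result on every quadrature node:

import timeit

def f(x, y, z):
    # Trivial Python callback, a stand-in for f1
    return x * x

# Approximate cost of one Python-level call; multiplied by the number of
# quadrature nodes (100**3 for Quad(100, 3)), this alone is on the order of
# tenths of a second before any numerical work is done.
per_call = timeit.timeit(lambda: f(0.1, 0.2, 0.3), number=10**6) / 10**6
print("per call:", per_call, "s")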
People typically use numpy for fast linear algebra in Python, as it uses SIMD for many common operations. You should probably consider using numpy before rolling out a C++ implementation. In C++, you would have to use something like Intel MKL on top of Eigen to get comparable vectorization.
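As a minimal illustration of that point (the array shapes mirror the numpy version from the question's second edit; the random data here is only a stand-in for the real nodes and weights), f2 can be evaluated on all quadrature nodes at once, so the per-node Python call is replaced by a handful of vectorized operations:

import numpy as np

def f2(v):
    # v has shape (3, N): one row per coordinate, one column per quadrature node
    x, y, z = v
    term = np.cos(2*x + 2*y + 2*z) + x*y + np.exp(-z*z)
    return 5 * term  # the original f2 repeats this same term five times

# Illustrative stand-ins for the (3, N) node array and length-N weight vector
rng = np.random.default_rng(0)
nodes = rng.standard_normal((3, 1000))
weights = rng.random(1000)

result = np.dot(f2(nodes), weights)  # one vectorized reduction instead of a Python loop
print(result)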
