Small array multiplication is slow
Benchmark results:
STATS FOR OPENBLAS
---------------------------------------------------------------
NUMBER OF THREADS: 1
NUMBER OF PROCESSORS: 6
CONFIG: OpenBLAS 0.3.13 NO_LAPACKE DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64
CORE NAME: Haswell
PARALLEL: 1
2021-07-10T17:29:27+00:00
Running ./eigen_performance_test
Run on (6 X 2900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x6)
L1 Instruction 32 KiB (x6)
L2 Unified 256 KiB (x6)
L3 Unified 12288 KiB (x6)
Load Average: 1.39, 0.97, 0.65
-------------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------------
eigen_matrix_vector_mult_test/2 52.0 ns 51.9 ns 10896682
eigen_matrix_vector_mult_test/4 45.3 ns 45.1 ns 15167031
eigen_matrix_vector_mult_test/8 58.9 ns 58.7 ns 11416469
eigen_matrix_vector_mult_test/16 97.1 ns 97.0 ns 6808424
eigen_matrix_vector_mult_test/32 225 ns 224 ns 3104947
eigen_matrix_vector_mult_test/64 708 ns 707 ns 982545
eigen_matrix_vector_mult_test/128 3247 ns 3241 ns 215232
eigen_matrix_vector_mult_test/256 17878 ns 17854 ns 40561
eigen_matrix_vector_mult_test/512 79920 ns 79654 ns 8723
eigen_matrix_vector_mult_test/1024 400534 ns 399784 ns 1756
eigen_matrix_matrix_mult_test/2 10.0 ns 9.98 ns 68861889
eigen_matrix_matrix_mult_test/4 7.77 ns 7.76 ns 88185892
eigen_matrix_matrix_mult_test/8 33.3 ns 33.3 ns 20416419
eigen_matrix_matrix_mult_test/16 221 ns 221 ns 3159655
eigen_matrix_matrix_mult_test/32 484 ns 484 ns 1371857
eigen_matrix_matrix_mult_test/64 2112 ns 2112 ns 318067
eigen_matrix_matrix_mult_test/128 14355 ns 14352 ns 48288
eigen_matrix_matrix_mult_test/256 61533 ns 61529 ns 11485
eigen_matrix_matrix_mult_test/512 276762 ns 276711 ns 2525
eigen_matrix_matrix_mult_test/1024 1389020 ns 1388564 ns 523
xtensor_matrix_matrix_mult_test/2 403 ns 403 ns 1692067
xtensor_matrix_matrix_mult_test/4 402 ns 402 ns 1746090
xtensor_matrix_matrix_mult_test/8 418 ns 417 ns 1674838
xtensor_matrix_matrix_mult_test/16 436 ns 436 ns 1562858
xtensor_matrix_matrix_mult_test/32 562 ns 561 ns 1202666
xtensor_matrix_matrix_mult_test/64 1056 ns 1056 ns 667352
xtensor_matrix_matrix_mult_test/128 3066 ns 3066 ns 222012
xtensor_matrix_matrix_mult_test/256 14641 ns 14595 ns 47534
xtensor_matrix_matrix_mult_test/512 56908 ns 56907 ns 11483
xtensor_matrix_matrix_mult_test/1024 352145 ns 351940 ns 2301
Benchmark code:
#include <benchmark/benchmark.h>
#include <cblas.h> // OpenBLAS runtime-info functions (openblas_get_config, ...)
#include <iostream>

#include <Eigen/Dense>
#include <xtensor-blas/xlinalg.hpp>
#include <xtensor/xarray.hpp>
#include <xtensor/xnoalias.hpp>
#include <xtensor/xtensor.hpp>

void blas_stats() {
  std::cout << "\nSTATS FOR "
               "OPENBLAS\n-----------------------------------------------------"
               "----------\n\n";
  openblas_set_num_threads(1);
  std::cout << "NUMBER OF THREADS: " << openblas_get_num_threads() << std::endl;
  // Get the number of physical processors (cores).
  std::cout << "NUMBER OF PROCESSORS: " << openblas_get_num_procs() << std::endl;
  // Get the build configuration at runtime.
  std::cout << "CONFIG: " << openblas_get_config() << std::endl;
  // Get the CPU core name at runtime.
  std::cout << "CORE NAME: " << openblas_get_corename() << std::endl;
  // Get the parallelization type used by OpenBLAS.
  std::cout << "PARALLEL: " << openblas_get_parallel() << std::endl;
  std::cout << "\n\n";
}
static void eigen_matrix_vector_mult_test(benchmark::State &state) {
  int size = state.range(0);
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(size, size);
  Eigen::VectorXd v = Eigen::VectorXd::Random(size);
  Eigen::MatrixXd r = Eigen::MatrixXd::Random(size, 1);
  for (auto _ : state) {
    r.noalias() = m * v;
  }
}
BENCHMARK(eigen_matrix_vector_mult_test)->RangeMultiplier(2)->Range(2, 8 << 7);

static void eigen_matrix_matrix_mult_test(benchmark::State &state) {
  int size = state.range(0);
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(size, size);
  Eigen::MatrixXd v = Eigen::MatrixXd::Random(size, 1);
  Eigen::MatrixXd r = Eigen::MatrixXd::Random(size, 1);
  for (auto _ : state) {
    r.noalias() = m * v;
  }
}
BENCHMARK(eigen_matrix_matrix_mult_test)->RangeMultiplier(2)->Range(2, 8 << 7);
// Despite the name, this also measures a matrix-vector product (via dot).
// The arrays are left uninitialized; only the timing matters here.
static void xtensor_matrix_matrix_mult_test(benchmark::State &state) {
  uint64_t size = state.range(0);
  xt::xarray<double>::shape_type shape_m = {size, size};
  xt::xarray<double>::shape_type shape_v = {size};
  xt::xarray<double> m(shape_m);
  xt::xarray<double> v(shape_v);
  xt::xarray<double> r(shape_v);
  for (auto _ : state) {
    xt::noalias(r) = xt::linalg::dot(m, v);
  }
}
BENCHMARK(xtensor_matrix_matrix_mult_test)
    ->RangeMultiplier(2)
    ->Range(2, 8 << 7);
int main(int argc, char **argv) {
  blas_stats();
  benchmark::Initialize(&argc, argv);
  if (benchmark::ReportUnrecognizedArguments(argc, argv))
    return 1;
  benchmark::RunSpecifiedBenchmarks();
}
Any thoughts on why small arrays incur so much overhead in xtensor? What should I do if most of my arrays are small (but not fixed-size)?
I think there are two differences with Eigen:
- we always call BLAS, whereas Eigen has its own implementation that is better optimized for small matrices
- when you do a dot operation, there is some overhead to check whether you're broadcasting
It could be interesting to make a 2D dot function that works faster on small matrix/matrix and matrix/vector combinations. We could even use Eigen as a "backend" for that :)
> It could be interesting to make a 2D dot function that works faster on small matrix/matrix and matrix/vector combinations. We could even use Eigen as a "backend" for that :)
This is an interesting idea. First, two thoughts:
- Can you please confirm that the way I wrote the dot product is optimal? I built using these definitions:

  add_definitions(-DHAVE_CBLAS=1 -DWITH_OPENBLAS=1 -DXTENSOR_ENABLE_XSIMD=1)
  set(LINK_LIBRARIES ${CONAN_LIBS} ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} xtensor xtensor::optimize xtensor::use_xsimd)
  set(RELEASE_FLAGS "-Ofast -DNDEBUG -march=native ${WARNING_FLAGS}")

- Should I try to link with MKL, or do you think that will be just as slow for small matrices?
In case both options are a no-go: yes, I would love to see xtensor handle small matrix/matrix and matrix/vector dot products fast, either via hand-rolled SIMD code or via an Eigen backend.
As I mentioned before, the dot product is more general than Eigen's matrix/matrix-vector multiplication, as it performs broadcasting. Without the broadcasting checks you'd probably already be 2x faster for small matrices.
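For illustration, a specialized 2-D path that skips broadcasting entirely could map xtensor's buffers into Eigen. This is a minimal sketch assuming contiguous row-major data and a pre-sized result; dot2d is a hypothetical helper, not part of xtensor-blas:

#include <Eigen/Dense>
#include <xtensor/xtensor.hpp>

// Hypothetical specialized matrix-vector product: no broadcasting checks,
// just a view of xtensor's contiguous buffers through Eigen::Map.
// Assumes xt::xtensor's default row-major layout; r must already have
// shape {m.shape(0)}.
inline void dot2d(const xt::xtensor<double, 2> &m,
                  const xt::xtensor<double, 1> &v,
                  xt::xtensor<double, 1> &r) {
  using RowMajorMat =
      Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  Eigen::Map<const RowMajorMat> em(m.data(), m.shape(0), m.shape(1));
  Eigen::Map<const Eigen::VectorXd> ev(v.data(), v.size());
  Eigen::Map<Eigen::VectorXd> er(r.data(), r.size());
  er.noalias() = em * ev; // Eigen's small-matrix code path, no BLAS call
}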
You could also try using the xtensor class instead of xarray. It has a compile-time dimension (just as MatrixXd is always 2-dimensional): you can use xt::xtensor<double, 2>.
MKL won't help much, I'm afraid :) Calling into BLAS functions has some intrinsic overhead (even a function call vs. inlined code has some cost attached to it).
Using the xtensor class does not help either. As you say, the call into BLAS and the broadcast checks are probably dominating the time for small matrices. Do you have some example code on how I can use xsimd to multiply a small matrix with a vector, please?
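For illustration, a hand-rolled SIMD matrix-vector kernel could look roughly like this. This is a minimal sketch assuming xsimd >= 8 and contiguous row-major storage; small_gemv is a hypothetical helper, not an existing xtensor or xsimd API:

#include <cstddef>
#include <xsimd/xsimd.hpp>

// Hypothetical hand-rolled kernel: r = m * v for a small row-major
// rows x cols matrix. Vectorizes each row/vector dot product with xsimd.
void small_gemv(const double *m, const double *v, double *r,
                std::size_t rows, std::size_t cols) {
  using batch = xsimd::batch<double>;
  constexpr std::size_t width = batch::size;
  for (std::size_t i = 0; i < rows; ++i) {
    const double *row = m + i * cols;
    batch acc(0.0);
    std::size_t j = 0;
    // Accumulate `width` products per iteration with fused multiply-add.
    for (; j + width <= cols; j += width) {
      acc = xsimd::fma(batch::load_unaligned(row + j),
                       batch::load_unaligned(v + j), acc);
    }
    double sum = xsimd::reduce_add(acc);
    // Scalar tail for the remaining cols % width elements.
    for (; j < cols; ++j) {
      sum += row[j] * v[j];
    }
    r[i] = sum;
  }
}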
Actually the problem shows up in other kinds of operations as well. Even simple element-wise array sums in xtensor-based code are very slow compared to Eigen when it comes to small arrays. Can we please look at which bounds / broadcast checks are the culprit? Benchmark results:
Running ./array_ops_performance_test
Run on (6 X 2900 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x6)
L1 Instruction 32 KiB (x6)
L2 Unified 256 KiB (x6)
L3 Unified 12288 KiB (x6)
Load Average: 0.02, 0.64, 0.73
----------------------------------------------------------------
Benchmark Time CPU Iterations
----------------------------------------------------------------
eigen_array_sum/2 0.857 ns 0.854 ns 840056658
eigen_array_sum/4 3.61 ns 3.61 ns 197734174
eigen_array_sum/8 7.89 ns 7.88 ns 82837277
eigen_array_sum/16 36.7 ns 36.7 ns 19514002
eigen_array_sum/32 153 ns 153 ns 4527008
eigen_array_sum/64 1117 ns 1115 ns 627337
eigen_array_sum/128 6368 ns 6357 ns 102095
eigen_array_sum/256 32093 ns 32039 ns 24341
eigen_array_sum/512 155964 ns 155738 ns 4763
xtensor_array_sum/2 133 ns 132 ns 5450911
xtensor_array_sum/4 142 ns 141 ns 4697384
xtensor_array_sum/8 150 ns 150 ns 4680743
xtensor_array_sum/16 249 ns 249 ns 2787851
xtensor_array_sum/32 544 ns 542 ns 1326147
xtensor_array_sum/64 1954 ns 1950 ns 343751
xtensor_array_sum/128 8772 ns 8759 ns 74394
xtensor_array_sum/256 35809 ns 35724 ns 19592
xtensor_array_sum/512 113890 ns 113503 ns 6039
#include <benchmark/benchmark.h>
#include <Eigen/Dense>
#include <xtensor/xarray.hpp>
using namespace Eigen;
static void eigen_array_sum(benchmark::State &state) {
  int size = state.range(0);
  MatrixXd a = MatrixXd::Random(size, size);
  MatrixXd b = MatrixXd::Random(size, size);
  MatrixXd r = MatrixXd::Random(size, size);
  for (auto _ : state) {
    r = a + b;
  }
}
BENCHMARK(eigen_array_sum)->RangeMultiplier(2)->Range(2, 8 << 6);

static void xtensor_array_sum(benchmark::State &state) {
  uint64_t size = state.range(0);
  xt::xarray<double>::shape_type shape = {size, size};
  xt::xarray<double> a(shape);
  xt::xarray<double> b(shape);
  xt::xarray<double> r(shape);
  for (auto _ : state) {
    r = a + b;
  }
}
BENCHMARK(xtensor_array_sum)->RangeMultiplier(2)->Range(2, 8 << 6);
BENCHMARK_MAIN();
Comparing MatrixXd and xt::xarray<double> is never a fair comparison; they are quite different containers (xarray is dynamically n-dimensional, while MatrixXd is statically 2-D).
I'll update the benchmark results using xt::xtensor<double, 2>:
-------------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------------
eigen_array_sum/2 1.29 ns 1.28 ns 457478259
eigen_array_sum/4 3.31 ns 3.29 ns 208422955
eigen_array_sum/8 6.83 ns 6.80 ns 96818924
eigen_array_sum/16 32.8 ns 32.6 ns 20155042
eigen_array_sum/32 138 ns 138 ns 4957244
eigen_array_sum/64 1009 ns 1008 ns 652397
eigen_array_sum/128 6065 ns 6060 ns 112739
eigen_array_sum/256 27283 ns 27275 ns 25480
eigen_array_sum/512 120261 ns 120255 ns 5317
xtensor_array_sum/2 34.4 ns 34.4 ns 20148667
xtensor_array_sum/4 37.2 ns 37.2 ns 18514938
xtensor_array_sum/8 50.0 ns 50.0 ns 12796309
xtensor_array_sum/16 139 ns 139 ns 4996273
xtensor_array_sum/32 392 ns 392 ns 1787989
xtensor_array_sum/64 1740 ns 1738 ns 406974
xtensor_array_sum/128 7859 ns 7858 ns 81830
xtensor_array_sum/256 32434 ns 32432 ns 21366
xtensor_array_sum/512 104275 ns 104264 ns 6603
static void eigen_array_sum(benchmark::State &state) {
  int size = state.range(0);
  MatrixXd a = MatrixXd::Random(size, size);
  MatrixXd b = MatrixXd::Random(size, size);
  MatrixXd r = MatrixXd::Random(size, size);
  for (auto _ : state) {
    r = a + b;
  }
}
BENCHMARK(eigen_array_sum)->RangeMultiplier(2)->Range(2, 8 << 6);

static void xtensor_array_sum(benchmark::State &state) {
  uint64_t size = state.range(0);
  xt::xtensor<double, 2>::shape_type shape = {size, size};
  xt::xtensor<double, 2> a(shape);
  xt::xtensor<double, 2> b(shape);
  xt::xtensor<double, 2> r(shape);
  for (auto _ : state) {
    r = a + b;
  }
}
BENCHMARK(xtensor_array_sum)->RangeMultiplier(2)->Range(2, 8 << 6);
You can shave off some more nanoseconds by doing xt::noalias(r) = a + b; there might be a similar trick for Eigen.
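For instance, the xtensor benchmark above would become the following (xtensor_array_sum_noalias is a hypothetical name; xt::noalias skips xtensor's resize and aliasing logic on assignment):

#include <benchmark/benchmark.h>
#include <cstdint>
#include <xtensor/xnoalias.hpp>
#include <xtensor/xtensor.hpp>

static void xtensor_array_sum_noalias(benchmark::State &state) {
  uint64_t size = state.range(0);
  xt::xtensor<double, 2>::shape_type shape = {size, size};
  xt::xtensor<double, 2> a(shape);
  xt::xtensor<double, 2> b(shape);
  xt::xtensor<double, 2> r(shape);
  for (auto _ : state) {
    xt::noalias(r) = a + b; // skip the resize/aliasing check on assignment
  }
}
BENCHMARK(xtensor_array_sum_noalias)->RangeMultiplier(2)->Range(2, 8 << 6);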