Complexity measure needs to be able to present smaller values
While benchmarking the CUDA FFT library, I noticed that the minimum speed given by theBigO complexity measure is 0.01NlgN. This is a problem because when I try to benchmark the speed of the CUDA FFT library when including the fft_plan in the timing, it is an order of magnitude faster than not including the plan creation in the timing, and I cannot cleanly present that one is faster than the other.
Results:
-------------------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------------------
BM_R2CcuFFTWPlan/131072 454637 ns 454399 ns 1538
BM_R2CcuFFTWPlan/262144 688168 ns 687630 ns 946
BM_R2CcuFFTWPlan/524288 748108 ns 747737 ns 888
BM_R2CcuFFTWPlan/1048576 873088 ns 872584 ns 768
BM_R2CcuFFTWPlan/2097152 1106280 ns 1094416 ns 627
BM_R2CcuFFTWPlan/4194304 1532492 ns 1531535 ns 452
BM_R2CcuFFTWPlan/8388608 2412579 ns 2410158 ns 290
BM_R2CcuFFTWPlan/16777216 4169790 ns 4169792 ns 168
BM_R2CcuFFTWPlan/33554432 7691030 ns 7687242 ns 85
BM_R2CcuFFTWPlan/67108864 14719201 ns 14705280 ns 47
BM_R2CcuFFTWPlan/134217728 34119589 ns 34104306 ns 20
BM_R2CcuFFTWPlan/268435456 67600672 ns 67550141 ns 9
BM_R2CcuFFTWPlan/536870912 135257566 ns 135193994 ns 5
BM_R2CcuFFTWPlan_BigO 0.01 NlgN 0.01 NlgN
BM_R2CcuFFTWPlan_RMS 5 % 5 %
BM_R2CcuFFTNoPlan/131072 28320 ns 28320 ns 24627
BM_R2CcuFFTNoPlan/262144 46122 ns 46122 ns 13658
BM_R2CcuFFTNoPlan/524288 110148 ns 110053 ns 6193
BM_R2CcuFFTNoPlan/1048576 230731 ns 230638 ns 3013
BM_R2CcuFFTNoPlan/2097152 446010 ns 445790 ns 1384
BM_R2CcuFFTNoPlan/4194304 875786 ns 875366 ns 751
BM_R2CcuFFTNoPlan/8388608 1719761 ns 1719761 ns 399
BM_R2CcuFFTNoPlan/16777216 3403089 ns 3403088 ns 206
BM_R2CcuFFTNoPlan/33554432 6780352 ns 6777032 ns 95
BM_R2CcuFFTNoPlan/67108864 13519571 ns 13513831 ns 51
BM_R2CcuFFTNoPlan/134217728 32368661 ns 32344896 ns 22
BM_R2CcuFFTNoPlan/268435456 64653697 ns 64625494 ns 10
BM_R2CcuFFTNoPlan/536870912 130039288 ns 129979247 ns 5
BM_R2CcuFFTNoPlan_BigO 0.01 NlgN 0.01 NlgN
BM_R2CcuFFTNoPlan_RMS 4 % 4 %
Code:
#include <benchmark/benchmark.h>
#include <cufft.h>
static void BM_R2CcuFFTWPlan(benchmark::State& state)
{
int NX = state.range(0);
int BATCH = 1;
for (auto _ : state)
{
cufftHandle plan;
cufftComplex *data;
cudaMalloc((void**)&data, sizeof(cufftComplex)*(NX/2+1)*BATCH);
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
return;
}
if (cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan creation failed");
return;
}
if (cufftExecR2C(plan, (cufftReal*)data, data) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
return;
}
if (cudaDeviceSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
return;
}
cufftDestroy(plan);
cudaFree(data);
}
state.SetComplexityN(state.range(0));
}
static void BM_R2CcuFFTNoPlan(benchmark::State& state)
{
int NX = state.range(0);
int BATCH = 1;
cufftHandle plan;
cufftComplex *data;
cudaMalloc((void**)&data, sizeof(cufftComplex)*(NX/2+1)*BATCH);
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
return;
}
if (cufftPlan1d(&plan, NX, CUFFT_R2C, BATCH) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan creation failed");
return;
}
for (auto _ : state)
{
if (cufftExecR2C(plan, (cufftReal*)data, data) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
return;
}
benchmark::DoNotOptimize(cudaDeviceSynchronize());
}
cufftDestroy(plan);
cudaFree(data);
state.SetComplexityN(state.range(0));
}
BENCHMARK(BM_R2CcuFFTWPlan)->RangeMultiplier(2)->Range(131072, 1<<29)->Complexity(benchmark::oNLogN);
BENCHMARK(BM_R2CcuFFTNoPlan)->RangeMultiplier(2)->Range(131072, 1<<29)->Complexity(benchmark::oNLogN);
BENCHMARK_MAIN();
I've gone over the code for complexity and i don't see anything obvious that would artificially restrict the precision of the factor.
https://github.com/google/benchmark/blob/d70417994a3c845c49c4443e92b26a52b320a759/src/console_reporter.cc#L135 might be the issue, so you could try changing that if you're building benchmark from source.
i'm not sure exactly what the right answer is, but it does seem like restricting the accuracy in the output could lead to issues. open to a PR here.