kmcuda cuda build issues on Windows

Hi,

I am having troubles building CUDA files.

Following prerequisites were done:

platform toolset: Visual Studio 2015 (v140)
configuration type: Static library (.lib)
precompiled header: Not Using Precompiled Headers
target machine platform: 64-bit (--machine 64) (library was also set to x64 in build menu)
code generation: compute_61,sm_61;%(CodeGeneration)
additional library directories: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\lib\x64;%(AdditionalLibraryDirectories)
additional dependencies: cudart.lib;%(AdditionalDependencies)
Build customizations: CUDA 9.2
all files with .cu extension have CUDA C/C++ ItemType
master branch files are used
for the sake of simplicity knn related methods commented out. Only k-means is active
Python and R libraries removed from solution, hence are not causing any issues

Errors are following:

File: metric_abctraction.h Code:

	FPATTR static typename HALF<F>::type distance(
		F sqr1 __attribute__((unused)), F sqr2 __attribute__((unused)), F prod) {
		float fp = _float(_fin(prod));
		if (fp >= 1.f) return _half<F>(0.f);
		if (fp <= -1.f) return _half<F>(M_PI);
		return _half<F>(acos(fp));
	}

Error:

c:\users\lucky\source\repos\core\core3\metric_abstraction.h(172): error : expected a ")"

Comment: line with ((unused))

File: metric_abctraction.h Code:

	FPATTR static void normalize(uint32_t count __attribute__((unused)), float *vec) {
		// Kahan summation with inverted c
		float norm = 0, corr = 0;
#pragma unroll 4
		for (int f = 0; f < d_features_size; f++) {
			float v = vec[f];
			float y = _fma(corr, v, v);
			float t = norm + y;
			corr = y - (t - norm);
			norm = t;
		}
		norm = _reciprocal(_sqrt(norm));

#pragma unroll 4
		for (int f = 0; f < d_features_size; f++) {
			vec[f] = vec[f] * norm;
		}
	}

Error:

c:\users\lucky\source\repos\core\core3\metric_abstraction.h(255): error : expected a ")"

Comment: line with ((unused))

File: wrappers.h Code:

template <typename T>
class unique_devptr : public unique_devptr_parent<T> {
public:
	explicit unique_devptr(T *ptr, bool fake = false) : unique_devptr_parent<T>(
		ptr, fake ? [](T*) {} : [](T *p) { cudaFree(p); }) {}
};

Error:

1> c:\users\lucky\source\repos\core\core3\wrappers.h(20): error : more than one operator "?" matches these operands: 1> built-in operator "expression ? pointer : pointer" 1> built-in operator "expression ? pointer : pointer" 1> operand types are: lambda [](float *)->void : lambda [](float *)->void 1> detected during: 1> instantiation of "unique_devptr<T>::unique_devptr(T *, __nv_bool) [with T=float]"

Comment: line with ptr, fake ? [](T*) {} : [](T *p) { cudaFree(p); }) {}

File: metric_abstraction.h Code:

	FPATTR static float distance_t(const F *__restrict__ v1, const F *__restrict__ v2,
		uint64_t v1_size, uint64_t v1_index) {
		// Kahan summation with inverted c
		F prod = _const<F>(0), corr = _const<F>(0);
#pragma unroll 4
		for (uint64_t f = 0; f < d_features_size; f++) {
			F yprod = _fma(corr, v1[v1_size * f + v1_index], v2[f]);
			F tprod = _add(prod, yprod);
			corr = _sub(yprod, _sub(tprod, prod));
			prod = tprod;
		}
		return _float(distance(_const<F>(1), _const<F>(1), prod));
	}

Error:

1> c:\users\lucky\source\repos\core\core3\metric_abstraction.h(203): error : no instance of overloaded function "METRIC<(KMCUDADistanceMetric)1, F>::distance [with F=float]" matches the argument list 1> argument types are: (float, float, float) 1> detected during: 1> instantiation of "float METRIC<(KMCUDADistanceMetric)1, F>::distance_t(const F *, const F *, uint64_t, uint64_t) [with F=float]"

Comment: line with return _float(distance(_const<F>(1), _const<F>(1), prod));

Similar errors in same file:

1> C:/Users/Lucky/source/repos/Core/Core3/kmeans.cu(269): error : no instance of overloaded function "METRIC<(KMCUDADistanceMetric)1, F>::distance [with F=float]" matches the argument list 1> argument types are: (float, float, float) 1> detected during instantiation of "void kmeans_assign_lloyd_smallc<M,F>(uint32_t, uint32_t, const F *, const F *, uint32_t *, uint32_t *) [with M=kmcudaDistanceMetricCosine, F=float]" 1> (954): here 1> 1> C:/Users/Lucky/source/repos/Core/Core3/kmeans.cu(342): error : no instance of overloaded function "METRIC<(KMCUDADistanceMetric)1, F>::distance [with F=float]" matches the argument list 1> argument types are: (float, float, float) 1> detected during instantiation of "void kmeans_assign_lloyd<M,F>(uint32_t, uint32_t, const F *, const F *, uint32_t *, uint32_t *) [with M=kmcudaDistanceMetricCosine, F=float]" 1> (954): here

File: same Code:

FPATTR static void normalize(uint32_t count __attribute__((unused)), float *vec) {
		// Kahan summation with inverted c
		float norm = 0, corr = 0;
#pragma unroll 4
		for (int f = 0; f < d_features_size; f++) {
			float v = vec[f];
			float y = _fma(corr, v, v);
			float t = norm + y;
			corr = y - (t - norm);
			norm = t;
		}
		norm = _reciprocal(_sqrt(norm));

#pragma unroll 4
		for (int f = 0; f < d_features_size; f++) {
			vec[f] = vec[f] * norm;
		}
	}

Error:

1> c:\users\lucky\source\repos\core\core3\metric_abstraction.h(260): error : identifier "vec" is undefined 1> detected during: 1> instantiation of "void METRIC<(KMCUDADistanceMetric)1, F>::normalize(uint32_t) [with F=float]"

File: kmeans.cu Code:

template <KMCUDADistanceMetric M, typename F>
__global__ void kmeans_adjust(
    const uint32_t coffset, const uint32_t length,
    const F *__restrict__ samples,
    const uint32_t *__restrict__ assignments_prev,
    const uint32_t *__restrict__ assignments,
    F *__restrict__ centroids, uint32_t *__restrict__ ccounts) {
  uint32_t c = blockIdx.x * blockDim.x + threadIdx.x;
  if (c >= length) {
    return;
  }
  c += coffset;
  uint32_t my_count = ccounts[c];
  {
    F fmy_count = _const<F>(my_count);
    centroids += c * d_features_size;
    for (int f = 0; f < d_features_size; f++) {
      centroids[f] = _mul(centroids[f], fmy_count);
    }
  }
  extern __shared__ uint32_t ass[];
  int step = d_shmem_size / 2;
  F corr = _const<F>(0);
  for (uint32_t sbase = 0; sbase < d_samples_size; sbase += step) {
    __syncthreads();
    if (threadIdx.x == 0) {
      int pos = sbase;
      for (int i = 0; i < step && sbase + i < d_samples_size; i++) {
        ass[2 * i] = assignments[pos + i];
        ass[2 * i + 1] = assignments_prev[pos + i];
      }
    }
    __syncthreads();
    for (int i = 0; i < step && sbase + i < d_samples_size; i++) {
      uint32_t this_ass = ass[2 * i];
      uint32_t  prev_ass = ass[2 * i + 1];
      int sign = 0;
      if (prev_ass == c && this_ass != c) {
        sign = -1;
        my_count--;
      } else if (prev_ass != c && this_ass == c) {
        sign = 1;
        my_count++;
      }
      if (sign != 0) {
        F fsign = _const<F>(sign);
        #pragma unroll 4
        for (uint64_t f = 0; f < d_features_size; f++) {
          F centroid = centroids[f];
          F y = _fma(corr,
                     samples[static_cast<uint64_t>(d_samples_size) * f + sbase + i],
                     fsign);
          F t = _add(centroid, y);
          corr = _sub(y, _sub(t, centroid));
          centroids[f] = t;
        }
      }
    }
  }
  // my_count can be 0 => we get NaN with L2 and never use this cluster again
  // this is a feature, not a bug
  METRIC<M, F>::normalize(my_count, centroids);
  ccounts[c] = my_count;
}

Error:

1> C:/Users/Lucky/source/repos/Core/Core3/kmeans.cu(427): error : too many arguments in function call 1> detected during instantiation of "void kmeans_adjust<M,F>(uint32_t, uint32_t, const F *, const uint32_t *, const uint32_t *, F *, uint32_t *) [with M=kmcudaDistanceMetricCosine, F=float]" 1> (1002): here

Comment: error on line METRIC<M, F>::normalize(my_count, centroids);

File: tricks.cuh Code:

__device__ __forceinline__ uint32_t atomicAggInc(uint32_t *ctr) {
  int mask = ballot(1);
  int leader = __ffs(mask) - 1;
  uint32_t res;
  if ((threadIdx.x % warpSize) == leader) {
    res = atomicAdd(ctr, __popc(mask));
  }
  res = shfl(res, leader);
  return res + __popc(mask & ((1 << (threadIdx.x % warpSize)) - 1));
}

Error:

1> C:/Users/Lucky/source/repos/Core/Core3/tricks.cuh(31): error : identifier "shfl" is undefined

Fix for error 7: After I have added #include "private.h" into file - errors went away.. I wonder why wasn't this change committed? It seems like a very obvious fix and yet master branch does not include it.

Is CUDA project supposed to work at this moment? I would appreciate to hear out comments about errors that I have encountered.

Thanks in advance

Aug 01 '18 17:08 pavlexander

compiling c++ on windows

All the errors are obviously specific to MSVC++. The thing is, building this project has never been tested on Windows actually. I can add an Appveyor integration provided by somebody fixes these incompatibilities. Some of them are easy to fix, e.g. __attribute__, some of them are puzzling, e.g. the template errors. Anyway, I cannot do it myself in the upcoming months :(

Aug 01 '18 19:08 vmarkovtsev

I am not really c++ guy so lot's of code (95% of it) does not make any sense :)

Anyway, after hearing your answer - I started to dig into issues one by one, and resolved all of them.

Here are fixes that I have applied:

Fix. kmcuda.cc Before:

#if CUDA_ARCH < 60
  if (fp16x2) {
    INFO("CUDA device arch %d does not support fp16\n", CUDA_ARCH);
    return kmcudaInvalidArguments;
  }
#endif

if (props.major != (CUDA_ARCH / 10) || props.minor != (CUDA_ARCH % 10)) {
        INFO("compute capability mismatch for device %d: wanted %d.%d, have "
             "%d.%d\n>>>> you may want to build kmcuda with -DCUDA_ARCH=%d "
             "(refer to \"Building\" in README.md)\n",
             dev, CUDA_ARCH / 10, CUDA_ARCH % 10, props.major, props.minor,
             props.major * 10 + props.minor);
        devs.pop_back();

After:

/*#if CUDA_ARCH < 60
  if (fp16x2) {
    INFO("CUDA device arch %d does not support fp16\n", CUDA_ARCH);
    return kmcudaInvalidArguments;
  }
#endif*/

      /*if (props.major != (CUDA_ARCH / 10) || props.minor != (CUDA_ARCH % 10)) {
        INFO("compute capability mismatch for device %d: wanted %d.%d, have "
             "%d.%d\n>>>> you may want to build kmcuda with -DCUDA_ARCH=%d "
             "(refer to \"Building\" in README.md)\n",
             dev, CUDA_ARCH / 10, CUDA_ARCH % 10, props.major, props.minor,
             props.major * 10 + props.minor);
        devs.pop_back();
      }*/

Description: CUDA_ARCH variable is not available in host code. only CUDA can access it, so compile error was thrown.

Fix. metric_abstraction.h Before:

//
// distance and normalization functions.
//

After:

//
// distance and normalization functions.
//
#define _USE_MATH_DEFINES
#include <math.h>

Description: M_PI global variable was not available in scope of file, apparently <math.h> library contains it. The fix shall be added at the beginning of file. This is very important, apparently.

Fix. metric_abstraction.h Before:

  FPATTR static typename HALF<F>::type distance(
      F sqr1 __attribute__((unused)), F sqr2 __attribute__((unused)), F prod) {
    float fp = _float(_fin(prod));
    if (fp >= 1.f) return _half<F>(0.f);
    if (fp <= -1.f) return _half<F>(M_PI);
    return _half<F>(acos(fp));
  }

After

  FPATTR static typename HALF<F>::type distance(
      F sqr1, F sqr2, F prod) {
    float fp = _float(_fin(prod));
    if (fp >= 1.f) return _half<F>(0.f);
    if (fp <= -1.f) return _half<F>(M_PI);
    return _half<F>(acos(fp));
  }

Description: that's right, I just removed __attribute__((unused)). I have no idea what this attribute does, but, if isn't not used in any case - then how can an attribute make it worse? :) I don't know the consequences but many errors were fixed after this change. p.s. worth mentioning that I have removed all __attribute__((unused)) occurrences in the same file. Not just for 1 function.

Fix. tricks.cuh Before:

#include <cstdint>

After:

#include <cstdint>
#include "private.h"

Description: some functions from private.h are used, so.. must include it.

Fix. wrappers.h Before:

template <typename T>
class unique_devptr : public unique_devptr_parent<T> {
 public:
  explicit unique_devptr(T *ptr, bool fake = false) : unique_devptr_parent<T>(
      ptr, fake? [](T*){} : [](T *p){ cudaFree(p); }) {}
};

After:

template <typename T>
class unique_devptr : public unique_devptr_parent<T> {
public:
	explicit unique_devptr(T *ptr, bool fake = false) : unique_devptr_parent<T>(
		ptr, 
		(fake == true) 
		? (std::function<void(T*)>)([](T*) {})
		: [](T *p) { cudaFree(p); }
			) {}
};

Description: Apparently - an explicit type conversion is needed. A discussion was brought up on stack-overflow with similar issue. I have no idea what the problem is, but the fix is working as expected.

So, this seems to be it. I am yet to test if the code is working on GPU, but, at least the project gets compiled now.. If you have got any comments or concerns please share them :)

Aug 01 '18 20:08 pavlexander

p.s. code is workingon GPU just fine. Pull request will follow and possibly a separate wrapper project.

Aug 04 '18 17:08 pavlexander