Changes to support build with CUDA 11

pradeep · pradeep · commit 2bcb476ca42e · 2020-06-26T15:16:43.000+05:30
Also updates the CUB submodule from 1.8.0 to 1.9.10 and moves it from src/backend/cuda/cub to extern/cub.
diff --git a/.gitmodules b/.gitmodules
@@ -10,8 +10,8 @@
 [submodule "src/backend/cpu/threads"]
 	path = src/backend/cpu/threads
 	url = https://github.com/alltheflops/threads.git
-[submodule "src/backend/cuda/cub"]
-	path = src/backend/cuda/cub
+[submodule "extern/cub"]
+	path = extern/cub
 	url = https://github.com/NVlabs/cub.git
 [submodule "extern/spdlog"]
 	path = extern/spdlog
diff --git a/extern/cub b/extern/cub
@@ -0,0 +1 @@
+Subproject commit d106ddb991a56c3df1b6d51b2409e36ba8181ce4
diff --git a/src/backend/cuda/CMakeLists.txt b/src/backend/cuda/CMakeLists.txt
@@ -101,11 +101,13 @@ cuda_include_directories(
   ${ArrayFire_BINARY_DIR}/include
   ${CMAKE_CURRENT_SOURCE_DIR}/kernel
   ${CMAKE_CURRENT_SOURCE_DIR}/jit
-  ${CMAKE_CURRENT_SOURCE_DIR}/cub
   ${ArrayFire_SOURCE_DIR}/src/api/c
   ${ArrayFire_SOURCE_DIR}/src/backend
   ${COMMON_INTERFACE_DIRS}
   )
+if(CUDA_VERSION_MAJOR VERSION_LESS 11)
+  cuda_include_directories(${ArrayFire_SOURCE_DIR}/extern/cub)
+endif()
 
 file(GLOB jit_src "kernel/jit.cuh")
 
@@ -679,6 +681,12 @@ target_include_directories (afcuda
     ${CMAKE_CURRENT_BINARY_DIR}
 )
 
+if(CUDA_VERSION_MAJOR VERSION_LESS 11)
+  target_include_directories(afcuda PRIVATE ${ArrayFire_SOURCE_DIR}/extern/cub)
+  target_include_directories(af_cuda_static_cuda_library
+    PRIVATE ${ArrayFire_SOURCE_DIR}/extern/cub)
+endif()
+
 target_link_libraries(afcuda
   PRIVATE
     c_api_interface
diff --git a/src/backend/cuda/cub b/src/backend/cuda/cub
diff --git a/src/backend/cuda/sparse.cu b/src/backend/cuda/sparse.cu
@@ -28,24 +28,6 @@ namespace cuda {
 
 using namespace common;
 
-// cusparseStatus_t cusparseZcsr2csc(cusparseHandle_t handle,
-//                                  int m, int n, int nnz,
-//                                  const cuDoubleComplex *csrSortedVal,
-//                                  const int *csrSortedRowPtr, const int
-//                                  *csrSortedColInd, cuDoubleComplex
-//                                  *cscSortedVal, int *cscSortedRowInd, int
-//                                  *cscSortedColPtr, cusparseAction_t
-//                                  copyValues, cusparseIndexBase_t idxBase);
-
-template<typename T>
-struct csr2csc_func_def_t {
-    typedef cusparseStatus_t (*csr2csc_func_def)(cusparseHandle_t, int, int,
-                                                 int, const T *, const int *,
-                                                 const int *, T *, int *, int *,
-                                                 cusparseAction_t,
-                                                 cusparseIndexBase_t);
-};
-
 // cusparseStatus_t cusparseZdense2csr(cusparseHandle_t handle,
 //                                    int m, int n,
 //                                    const cusparseMatDescr_t descrA,
@@ -144,12 +126,6 @@ struct gthr_func_def_t {
                cusparse##PREFIX##FUNC;                                      \
     }
 
-SPARSE_FUNC_DEF(csr2csc)
-SPARSE_FUNC(csr2csc, float, S)
-SPARSE_FUNC(csr2csc, double, D)
-SPARSE_FUNC(csr2csc, cfloat, C)
-SPARSE_FUNC(csr2csc, cdouble, Z)
-
 SPARSE_FUNC_DEF(dense2csr)
 SPARSE_FUNC(dense2csr, float, S)
 SPARSE_FUNC(dense2csr, double, D)
diff --git a/src/backend/cuda/sparse_arith.cu b/src/backend/cuda/sparse_arith.cu
@@ -111,6 +111,60 @@ SparseArray<T> arithOp(const SparseArray<T> &lhs, const Array<T> &rhs,
     return out;
 }
 
+#define SPARSE_ARITH_OP_FUNC_DEF(FUNC) \
+    template<typename T>               \
+    FUNC##_def<T> FUNC##_func();
+
+#define SPARSE_ARITH_OP_FUNC(FUNC, TYPE, INFIX) \
+    template<>                                  \
+    FUNC##_def<TYPE> FUNC##_func<TYPE>() {      \
+        return cusparse##INFIX##FUNC;           \
+    }
+
+#if CUDA_VERSION >= 11000
+
+template<typename T>
+using csrgeam2_buffer_size_def = cusparseStatus_t (*)(
+    cusparseHandle_t, int, int, const T *, const cusparseMatDescr_t, int,
+    const T *, const int *, const int *, const T *, const cusparseMatDescr_t,
+    int, const T *, const int *, const int *, const cusparseMatDescr_t,
+    const T *, const int *, const int *, size_t *);
+
+#define SPARSE_ARITH_OP_BUFFER_SIZE_FUNC_DEF(FUNC) \
+    template<typename T>                           \
+    FUNC##_buffer_size_def<T> FUNC##_buffer_size_func();
+
+SPARSE_ARITH_OP_BUFFER_SIZE_FUNC_DEF(csrgeam2);
+
+#define SPARSE_ARITH_OP_BUFFER_SIZE_FUNC(FUNC, TYPE, INFIX)        \
+    template<>                                                     \
+    FUNC##_buffer_size_def<TYPE> FUNC##_buffer_size_func<TYPE>() { \
+        return cusparse##INFIX##FUNC##_bufferSizeExt;              \
+    }
+
+SPARSE_ARITH_OP_BUFFER_SIZE_FUNC(csrgeam2, float, S);
+SPARSE_ARITH_OP_BUFFER_SIZE_FUNC(csrgeam2, double, D);
+SPARSE_ARITH_OP_BUFFER_SIZE_FUNC(csrgeam2, cfloat, C);
+SPARSE_ARITH_OP_BUFFER_SIZE_FUNC(csrgeam2, cdouble, Z);
+
+template<typename T>
+using csrgeam2_def = cusparseStatus_t (*)(cusparseHandle_t, int, int, const T *,
+                                          const cusparseMatDescr_t, int,
+                                          const T *, const int *, const int *,
+                                          const T *, const cusparseMatDescr_t,
+                                          int, const T *, const int *,
+                                          const int *, const cusparseMatDescr_t,
+                                          T *, int *, int *, void *);
+
+SPARSE_ARITH_OP_FUNC_DEF(csrgeam2);
+
+SPARSE_ARITH_OP_FUNC(csrgeam2, float, S);
+SPARSE_ARITH_OP_FUNC(csrgeam2, double, D);
+SPARSE_ARITH_OP_FUNC(csrgeam2, cfloat, C);
+SPARSE_ARITH_OP_FUNC(csrgeam2, cdouble, Z);
+
+#else
+
 template<typename T>
 using csrgeam_def = cusparseStatus_t (*)(cusparseHandle_t, int, int, const T *,
                                          const cusparseMatDescr_t, int,
@@ -120,23 +174,15 @@ using csrgeam_def = cusparseStatus_t (*)(cusparseHandle_t, int, int, const T *,
                                          const int *, const cusparseMatDescr_t,
                                          T *, int *, int *);
 
-#define SPARSE_ARITH_OP_FUNC_DEF(FUNC) \
-    template<typename T>               \
-    FUNC##_def<T> FUNC##_func();
-
 SPARSE_ARITH_OP_FUNC_DEF(csrgeam);
 
-#define SPARSE_ARITH_OP_FUNC(FUNC, TYPE, INFIX) \
-    template<>                                  \
-    FUNC##_def<TYPE> FUNC##_func<TYPE>() {      \
-        return cusparse##INFIX##FUNC;           \
-    }
-
 SPARSE_ARITH_OP_FUNC(csrgeam, float, S);
 SPARSE_ARITH_OP_FUNC(csrgeam, double, D);
 SPARSE_ARITH_OP_FUNC(csrgeam, cfloat, C);
 SPARSE_ARITH_OP_FUNC(csrgeam, cdouble, Z);
 
+#endif
+
 template<typename T, af_op_t op>
 SparseArray<T> arithOp(const SparseArray<T> &lhs, const SparseArray<T> &rhs) {
     lhs.eval();
@@ -163,9 +209,28 @@ SparseArray<T> arithOp(const SparseArray<T> &lhs, const SparseArray<T> &rhs) {
     int baseC, nnzC;
     int *nnzcDevHostPtr = &nnzC;
 
+    T alpha = scalar<T>(1);
+    T beta  = op == af_sub_t ? scalar<T>(-1) : alpha;
+
+#if CUDA_VERSION >= 11000
+    size_t pBufferSize = 0;
+    // Query the csrgeam2 scratch size; check the status like the Nnz call.
+    CUSPARSE_CHECK(csrgeam2_buffer_size_func<T>()(
+        sparseHandle(), M, N, &alpha, desc, nnzA, lhs.getValues().get(),
+        csrRowPtrA, csrColPtrA, &beta, desc, nnzB, rhs.getValues().get(),
+        csrRowPtrB, csrColPtrB, desc, NULL, csrRowPtrC, NULL, &pBufferSize));
+
+    auto tmpBuffer = createEmptyArray<char>(dim4(pBufferSize));
+
+    CUSPARSE_CHECK(cusparseXcsrgeam2Nnz(
+        sparseHandle(), M, N, desc, nnzA, csrRowPtrA, csrColPtrA, desc, nnzB,
+        csrRowPtrB, csrColPtrB, desc, csrRowPtrC, nnzcDevHostPtr,
+        tmpBuffer.get()));
+#else
     CUSPARSE_CHECK(cusparseXcsrgeamNnz(
         sparseHandle(), M, N, desc, nnzA, csrRowPtrA, csrColPtrA, desc, nnzB,
         csrRowPtrB, csrColPtrB, desc, csrRowPtrC, nnzcDevHostPtr));
+#endif
     if (NULL != nnzcDevHostPtr) {
         nnzC = *nnzcDevHostPtr;
     } else {
@@ -181,15 +246,18 @@ SparseArray<T> arithOp(const SparseArray<T> &lhs, const SparseArray<T> &rhs) {
 
     auto outColIdx = createEmptyArray<int>(dim4(nnzC));
     auto outValues = createEmptyArray<T>(dim4(nnzC));
-
-    T alpha = scalar<T>(1);
-    T beta  = op == af_sub_t ? scalar<T>(-1) : alpha;
-
+#if CUDA_VERSION >= 11000
+    CUSPARSE_CHECK(csrgeam2_func<T>()(
+        sparseHandle(), M, N, &alpha, desc, nnzA, lhs.getValues().get(),
+        csrRowPtrA, csrColPtrA, &beta, desc, nnzB, rhs.getValues().get(),
+        csrRowPtrB, csrColPtrB, desc, outValues.get(), csrRowPtrC,
+        outColIdx.get(), tmpBuffer.get()));  // do not drop the status
+#else
     csrgeam_func<T>()(sparseHandle(), M, N, &alpha, desc, nnzA,
                       lhs.getValues().get(), csrRowPtrA, csrColPtrA, &beta,
                       desc, nnzB, rhs.getValues().get(), csrRowPtrB, csrColPtrB,
                       desc, outValues.get(), csrRowPtrC, outColIdx.get());
-
+#endif
     SparseArray<T> retVal = createArrayDataSparseArray(
         ldims, outValues, outRowIdx, outColIdx, sfmt);
     return retVal;