nvidia
NAME | WHAT | SOURCE | HEADER | --- |
---|---|---|---|---|
CUDA | Compute Unified Device Architecture | N | cuda.h | libcuda.so |
cuBLAS | Basic Linear Algebra Subroutine | N | cublas_v2.h | cublas.so |
CUTLASS | CUDA Templates for Linear Algebra Subroutines | Y | cutlass/cutlass.h | |
CUB | Cooperative primitives for CUDA C++ | Y | cub/cub.cuh | |
Thrust | The C++ Parallel Algorithms Library | Y | thrust/.h | |
cuSOLVER | LAPACK | N | cusolverDn.h | libcusolver.so |
CUDA
一般 CUDA 指 CUDA Toolkit
#include "cuda.h"
#include "cuda_runtime.h"
#include <iostream>
int main() {
int rt_ver, dr_ver;
auto err1 = cudaRuntimeGetVersion(&rt_ver);
if (err1 != cudaSuccess) {
std::cerr << cudaGetErrorString(err1) << std::endl;
return -1;
}
std::cout << "Runtime version: " << rt_ver << std::endl;
auto err2 = cudaDriverGetVersion(&dr_ver);
if (err2 != cudaSuccess) {
std::cerr << cudaGetErrorString(err2) << std::endl;
return -1;
}
std::cout << "Driver version: " << dr_ver << std::endl;
return 0;
}
cuBLAS
CUDA Basic Linear Algebra Subroutine library
CUTLASS
利用cutlass的warp和block层级的接口实现一个大的attention kernel为例, 难点主要是需要熟练使用cutlass,并能进行调优,而采用cutlass的开发有以下4个难点:
- cutlass并不是一个如cublas那样使用简单的GEMM库。cutlass仅提供threads block, warp, instruction各个级别的原语,开发者需要熟悉cutlass源码,并根据需求组装原语实现kernel。
- cutlass使用了大量的模版,在debug过程中需要阅读很长的log,且基本无法使用代码跳转工具,会给开发带来一定困难。
- cutlass并不提供自动调优的功能,需要开发者手动选择最优的设置,或者接入框架的自动调优中。
- 由于可能的组合非常多,基于cutlass进行开发时通常需要编写自动生成kernel的脚本。
从torch的实现上可以看出该工作需要的工作量还是比较大.
CUB
#include <cub/cub.cuh>
// Block-sorting CUDA kernel
__global__ void BlockSortKernel(int *d_in, int *d_out)
{
using namespace cub;
// Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads
// owning 16 integer items each
typedef BlockRadixSort<int, 128, 16> BlockRadixSort;
typedef BlockLoad<int, 128, 16, BLOCK_LOAD_TRANSPOSE> BlockLoad;
typedef BlockStore<int, 128, 16, BLOCK_STORE_TRANSPOSE> BlockStore;
// Allocate shared memory
__shared__ union {
typename BlockRadixSort::TempStorage sort;
typename BlockLoad::TempStorage load;
typename BlockStore::TempStorage store;
} temp_storage;
int block_offset = blockIdx.x * (128 * 16); // OffsetT for this block's ment
// Obtain a segment of 2048 consecutive keys that are blocked across threads
int thread_keys[16];
BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys);
__syncthreads();
// Collectively sort the keys
BlockRadixSort(temp_storage.sort).Sort(thread_keys);
__syncthreads();
// Store the sorted segment
BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys);
}
cuSolver
cuSolverDN: Dense LAPACK
, solve .
cuSolverSP: Sparse LAPACK
, solve
cuSolverRF: Refactorization
, solve .
Thrust
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <thrust/random.h>
int main() {
// Generate 32M random numbers serially.
thrust::default_random_engine rng(1337);
thrust::uniform_int_distribution<int> dist;
thrust::host_vector<int> h_vec(32 << 20);
thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); });
// Transfer data to the device.
thrust::device_vector<int> d_vec = h_vec;
// Sort data on the device.
thrust::sort(d_vec.begin(), d_vec.end());
// Transfer data back to host.
thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
}
Comparasion
NVIDIA HPC SDK vs CUDA Toolkit
cuda.h vs cuda_runtime.h