Table of Contents
- 1. Understanding CUDA grids and blocks
- 1) First CUDA project

- 2. Cross-referencing .cu and .cpp files, and the Makefile
- 3. Matrix multiplication (matmul) with CUDA, error handling, and querying hardware info
- 1) Matrix multiplication
- 2) Error handling
- 3) Querying hardware info

- 4. Installing Nsight Systems and Nsight Compute
- 5. Shared memory; bank conflicts (causes and fixes); accelerating TensorRT (TRT) pre/post-processing with CUDA; streams and events (using CUDA streams to improve concurrency)
- 6. Bilinear interpolation and affine transformation
 
1. Understanding CUDA grids and blocks
- Goal
 Understand how to write 1D, 2D, and 3D grids and blocks in CUDA, and how to iterate over threads.
1) First CUDA project
- Modify the project's Makefile.config

- Overall file layout

- Code
#include <cuda_runtime.h>
#include <stdio.h>
__global__ void print_idx_kernel(){
    printf("block idx: (%3d, %3d, %3d), thread idx: (%3d, %3d, %3d)\n",
         blockIdx.z, blockIdx.y, blockIdx.x,
         threadIdx.z, threadIdx.y, threadIdx.x);
}
__global__ void print_dim_kernel(){
    printf("grid dimension: (%3d, %3d, %3d), block dimension: (%3d, %3d, %3d)\n",
         gridDim.z, gridDim.y, gridDim.x,
         blockDim.z, blockDim.y, blockDim.x);
}
__global__ void print_thread_idx_per_block_kernel(){
    int index = threadIdx.z * blockDim.x * blockDim.y + \
              threadIdx.y * blockDim.x + \
              threadIdx.x;
    printf("block idx: (%3d, %3d, %3d), thread idx: %3d\n",
         blockIdx.z, blockIdx.y, blockIdx.x,
         index);
}
__global__ void print_thread_idx_per_grid_kernel(){
    int bSize  = blockDim.z * blockDim.y * blockDim.x;   // threads per block
    // linear block index within the grid
    int bIndex = blockIdx.z * gridDim.x * gridDim.y + \
               blockIdx.y * gridDim.x + \
               blockIdx.x;
    // linear thread index within the block
    int tIndex = threadIdx.z * blockDim.x * blockDim.y + \
               threadIdx.y * blockDim.x + \
               threadIdx.x;
    int index  = bIndex * bSize + tIndex;                // global thread index
    printf("block idx: %3d, thread idx in block: %3d, thread idx: %3d\n", 
         bIndex, tIndex, index);
}
__global__ void print_cord_kernel(){
    int index = threadIdx.z * blockDim.x * blockDim.y + \
              threadIdx.y * blockDim.x + \
              threadIdx.x;
    // global 2D coordinate of this thread in the computation domain
    int x  = blockIdx.x * blockDim.x + threadIdx.x;
    int y  = blockIdx.y * blockDim.y + threadIdx.y;
    printf("block idx: (%3d, %3d, %3d), thread idx: %3d, cord: (%3d, %3d)\n",
         blockIdx.z, blockIdx.y, blockIdx.x,
         index, x, y);
}
void print_one_dim(){
    int inputSize = 8;
    int blockDim = 4;
    int gridDim = inputSize / blockDim;
    dim3 block(blockDim);
    dim3 grid(gridDim);
    /* Try each of these kernel launches one at a time */
    // print_idx_kernel<<<grid, block>>>();
    // print_dim_kernel<<<grid, block>>>();
    // print_thread_idx_per_block_kernel<<<grid, block>>>();
    print_thread_idx_per_grid_kernel<<<grid, block>>>();
    cudaDeviceSynchronize();
}
void print_two_dim(){
    int inputWidth = 4;
    int blockDim = 2;
    int gridDim = inputWidth / blockDim;
    dim3 block(blockDim, blockDim);
    dim3 grid(gridDim, gridDim);
    /* Try each of these kernel launches one at a time */
    // print_idx_kernel<<<grid, block>>>();
    // print_dim_kernel<<<grid, block>>>();
    // print_thread_idx_per_block_kernel<<<grid, block>>>();
    print_thread_idx_per_grid_kernel<<<grid, block>>>();
    cudaDeviceSynchronize();
}
void print_cord(){
    int inputWidth = 4;
    int blockDim = 2;
    int gridDim = inputWidth / blockDim;
    dim3 block(blockDim, blockDim);
    dim3 grid(gridDim, gridDim);
    print_cord_kernel<<<grid, block>>>();
    cudaDeviceSynchronize();
}
int main() {
    /*
    "synchronize" refers to synchronization; there are several kinds:
    cudaDeviceSynchronize:  synchronizes the CPU with the GPU; the CPU does not execute
                            subsequent statements until all preceding CUDA operations finish
    cudaStreamSynchronize:  similar to cudaDeviceSynchronize, but scoped to a single stream;
                            it only waits for the CPU/GPU operations in that stream
    cudaThreadSynchronize:  deprecated, no longer recommended
    __syncthreads:          synchronization within a thread block
    */
    // print_one_dim();
    // print_two_dim();
    print_cord();
    return 0;
}
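Since the comment in main() mentions cudaStreamSynchronize, here is a minimal sketch of per-stream synchronization (the helper name print_idx_on_stream is illustrative; it reuses print_idx_kernel from above):
void print_idx_on_stream(){
    cudaStream_t stream;
    cudaStreamCreate(&stream);                        // create a non-default stream
    // the 4th launch parameter selects the stream the kernel runs on
    print_idx_kernel<<<dim3(2), dim3(4), 0, stream>>>();
    cudaStreamSynchronize(stream);                    // wait only for this stream's work
    cudaStreamDestroy(stream);
}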
- Note
 __global__ marks a kernel function.
- Task: find a given thread within a block

The indexing code is shown above (the linear index walks z first, then y, and finally x, so x is the fastest-varying dimension).
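To make the linearization concrete, take the print_two_dim configuration, grid(2, 2) and block(2, 2) (so gridDim = (2, 2, 1) and blockDim = (2, 2, 1)), and the thread with blockIdx = (z=0, y=1, x=1), threadIdx = (z=0, y=0, x=1):

bSize  = 1 * 2 * 2 = 4
bIndex = 0 * 2 * 2 + 1 * 2 + 1 = 3
tIndex = 0 * 2 * 2 + 0 * 2 + 1 = 1
index  = 3 * 4 + 1 = 13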
 
 Typical optimization
 
2. Cross-referencing .cu and .cpp files, and the Makefile
- Compiler
 Use nvcc instead of gcc or g++; otherwise the CUDA code will fail to compile.
- One-line command to build project 1
nvcc print_index.cu  -o app -I  /usr/local/cuda/include/
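For multi-file projects that mix .cpp and .cu, the usual pattern (a sketch with illustrative file and function names, not the exact project layout) is to expose each kernel through a plain C++ launcher function: the header can then be included from ordinary .cpp code, and only the .cu file needs nvcc.
// kernel.hpp — plain C++ declaration, safe to include from .cpp files
void launch_print_idx(int gridX, int blockX);

// kernel.cu — compiled by nvcc
#include <cuda_runtime.h>
#include <stdio.h>
#include "kernel.hpp"
__global__ void idx_kernel(){
    printf("block %d, thread %d\n", blockIdx.x, threadIdx.x);
}
void launch_print_idx(int gridX, int blockX){
    idx_kernel<<<dim3(gridX), dim3(blockX)>>>();
    cudaDeviceSynchronize();
}

// main.cpp — compiled by g++, contains no CUDA syntax
#include "kernel.hpp"
int main(){ launch_print_idx(2, 4); return 0; }
The matching Makefile rules compile each translation unit with its own compiler and link with g++ against the CUDA runtime library:
nvcc -c kernel.cu -o kernel.o -I /usr/local/cuda/include
g++  -c main.cpp  -o main.o
g++  main.o kernel.o -o app -L /usr/local/cuda/lib64 -lcudart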
- Purpose of CUDA_CHECK
 When an error occurs, it reports where the error happened.
#define CUDA_CHECK(call) {                                                 \
    cudaError_t error = call;                                              \
    if (error != cudaSuccess) {                                            \
        printf("ERROR: %s:%d, ", __FILE__, __LINE__);                      \
        printf("CODE:%d, DETAIL:%s\n", error, cudaGetErrorString(error));  \
        exit(1);                                                           \
    }                                                                      \
}
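A minimal usage sketch (the pointer name d_ptr is illustrative): wrap every CUDA runtime call so a failure reports the file and line immediately.
float* d_ptr = nullptr;
CUDA_CHECK(cudaMalloc(&d_ptr, 100 * sizeof(float)));  // aborts with file/line on failure
CUDA_CHECK(cudaFree(d_ptr));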
3. Matrix multiplication (matmul) with CUDA, error handling, and querying hardware info
1) Matrix multiplication
- Goal
 Understand how CUDA accelerates matrix computation and the purpose of tiling (a minimal kernel sketch follows after the project layout below).
- Project layout
  
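As a point of reference for what the project implements, a minimal (naive, non-tiled) kernel is sketched below; the names and the square-matrix assumption are illustrative. Each thread computes one element of C = A x B. Tiling improves on this by staging sub-blocks of A and B in shared memory, so each global-memory value is loaded once per tile instead of once per output element.
__global__ void matmul_naive_kernel(float* A, float* B, float* C, int M){
    int x = blockIdx.x * blockDim.x + threadIdx.x;   // column of C
    int y = blockIdx.y * blockDim.y + threadIdx.y;   // row of C
    if (x >= M || y >= M) return;
    float sum = 0.0f;
    for (int k = 0; k < M; k++)
        sum += A[y * M + k] * B[k * M + x];          // dot product of row y and column x
    C[y * M + x] = sum;
}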
2) Error handling
- Project layout
  
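One detail worth handling here: kernel launches return void rather than cudaError_t, so CUDA_CHECK cannot wrap them directly. A common companion pattern (a sketch; the project's actual macro may differ) checks cudaGetLastError() right after the launch:
// catches launch errors (e.g. invalid configuration) immediately;
// errors raised during kernel execution only surface at a sync point
#define KERNEL_CHECK() CUDA_CHECK(cudaGetLastError())

// usage (grid/block and kernel name are illustrative):
// some_kernel<<<grid, block>>>(args);
// KERNEL_CHECK();
// CUDA_CHECK(cudaDeviceSynchronize());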
3) Querying hardware info
- Goal
 Learn how to display GPU hardware information with the CUDA runtime API, and understand why this information matters.
- Project layout

- Sample output

- Code
#include <cuda_runtime.h>
#include <stdio.h>

// minimal stand-in for the project's LOG macro (the original defines it in a utility header)
#define LOG(...)  do { printf(__VA_ARGS__); printf("\n"); } while (0)

int main(){
    int count;
    int index = 0;
    cudaGetDeviceCount(&count);
    while (index < count) {
        cudaSetDevice(index);
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, index);
        LOG("%-40s",             "*********************Architecture related**********************");
        LOG("%-40s%d%s",         "Device id: ",                   index, "");
        LOG("%-40s%s%s",         "Device name: ",                 prop.name, "");
        LOG("%-40s%.1f%s",       "Device compute capability: ",   prop.major + (float)prop.minor / 10, "");
        LOG("%-40s%.2f%s",       "GPU global meory size: ",       (float)prop.totalGlobalMem / (1<<30), "GB");
        LOG("%-40s%.2f%s",       "L2 cache size: ",               (float)prop.l2CacheSize / (1<<20), "MB");
        LOG("%-40s%.2f%s",       "Shared memory per block: ",     (float)prop.sharedMemPerBlock / (1<<10), "KB");
        LOG("%-40s%.2f%s",       "Shared memory per SM: ",        (float)prop.sharedMemPerMultiprocessor / (1<<10), "KB");
        LOG("%-40s%.2f%s",       "Device clock rate: ",           prop.clockRate*1E-6, "GHz");
        LOG("%-40s%.2f%s",       "Device memory clock rate: ",    prop.memoryClockRate*1E-6, "Ghz");
        LOG("%-40s%d%s",         "Number of SM: ",                prop.multiProcessorCount, "");
        LOG("%-40s%d%s",         "Warp size: ",                   prop.warpSize, "");
        LOG("%-40s",             "*********************Parameter related************************");
        LOG("%-40s%d%s",         "Max block numbers: ",           prop.maxBlocksPerMultiProcessor, "");
        LOG("%-40s%d%s",         "Max threads per block: ",       prop.maxThreadsPerBlock, "");
        LOG("%-40s%d:%d:%d%s",   "Max block dimension size:",     prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2], "");
        LOG("%-40s%d:%d:%d%s",   "Max grid dimension size: ",     prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2], "");
        index ++;
        printf("\n");
    }
    return 0;
}
- Why these parameters matter
 Compiling .cu code often requires passing extra flags to nvcc, so printing the GPU info is handy when building. For example, shared memory usage matters a lot for accelerating CUDA programs (the split between shared memory and L1 cache can be adjusted dynamically), and knowing how many threads make up a warp, the thread-scheduling unit, helps improve utilization.
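For example, the compute capability reported above maps directly onto nvcc's -arch flag (sm_86 and the file name below are illustrative; substitute the value the query prints):
nvcc matmul.cu -o app -arch=sm_86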