2.1.1 检查块和线程索引
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cuda_runtime.h>
#define CHECK(call) \
 {\
    const cudaError_t error = call; \
    if (error != cudaSuccess)\
    {\
        printf("Error: %s: %d\n", __FILE__, __LINE__);\
        printf("code :%d reason :%s\n", error , cudaGetErrorString(error));\
        exit(1);\
    }\
}
void initialInt( int * ip, int size)
{
    for (int i =0; i < size; i ++)
    {
        ip[i] = i;
    }
}
void printMatrix(int *C, const int nx, const int ny)
{
    int *ic = C;
    printf("\n matrix : (%d, %d)\n", nx, ny);
    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix =0; ix < nx; ix++){
            printf("%3d",ic[ix]);
        }
        ic += nx;
        printf("\n");
    }
    printf("\n");
}
__global__ void printThreadIndex(int *A, const int nx, const int ny)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy*nx + ix;
    printf("threadidx: (%d ,%d) blockidx:(%d ,%d) coordinate: (%d ,%d) global index: (%2d ival %2d)\n", 
    threadIdx.x, threadIdx.y,
    blockIdx.x, blockIdx.y,
    ix, iy,
    idx, A[idx]
    );
}
int main(int argc , char **argv)
{
    printf("%s starting\n", argv[0]);
    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop,dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));
    // set matrix 
    int nx = 8;
    int ny = 6;
    int nxy = nx * ny;
    int nBytes = nxy * sizeof(float);
    // malloc host memory
    int * h_A;
    h_A = (int *) malloc(nBytes);
    //initial int
    initialInt(h_A, nxy);
    printMatrix(h_A, nx, ny);
    // device
    int *d_MatA;
    cudaMalloc((void **)&d_MatA, nBytes);
    cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
    dim3 block(4,2);
    dim3 grid ((nx + block.x - 1)/block.x, (ny + block.y - 1)/ block.y);
    printf("execution config grid (%d, %d), block (%d, %d)\n", grid.x, grid.y, block.x, block.y);
    printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);
    cudaDeviceSynchronize();
    cudaFree(d_MatA);
    free(h_A);
    cudaDeviceReset();
    
    return 0;
}
 
生成的2D 矩阵 8X6 在HOST里的样子
 matrix : (8, 6)
 0 1 2 3 4 5 6 7
 8 9 10 11 12 13 14 15
 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31
 32 33 34 35 36 37 38 39
 40 41 42 43 44 45 46 47
因为block(4,2)定义的block.x =4, block.y = 2, 所以grid的尺寸 grid(2,3), grid.x = 2, grid.y = 3
 
 而后在printThreadindex里输出
 threadidx: (0 ,0) blockidx:(0 ,1) coordinate: (0 ,2) global index: (16 ival 16)
 threadidx: (1 ,0) blockidx:(0 ,1) coordinate: (1 ,2) global index: (17 ival 17)
 threadidx: (2 ,0) blockidx:(0 ,1) coordinate: (2 ,2) global index: (18 ival 18)
 threadidx: (3 ,0) blockidx:(0 ,1) coordinate: (3 ,2) global index: (19 ival 19)
 threadidx: (0 ,1) blockidx:(0 ,1) coordinate: (0 ,3) global index: (24 ival 24)
 threadidx: (1 ,1) blockidx:(0 ,1) coordinate: (1 ,3) global index: (25 ival 25)
 threadidx: (2 ,1) blockidx:(0 ,1) coordinate: (2 ,3) global index: (26 ival 26)
 threadidx: (3 ,1) blockidx:(0 ,1) coordinate: (3 ,3) global index: (27 ival 27)
 threadidx: (0 ,0) blockidx:(1 ,1) coordinate: (4 ,2) global index: (20 ival 20)
 threadidx: (1 ,0) blockidx:(1 ,1) coordinate: (5 ,2) global index: (21 ival 21)
 threadidx: (2 ,0) blockidx:(1 ,1) coordinate: (6 ,2) global index: (22 ival 22)
 threadidx: (3 ,0) blockidx:(1 ,1) coordinate: (7 ,2) global index: (23 ival 23)
 threadidx: (0 ,1) blockidx:(1 ,1) coordinate: (4 ,3) global index: (28 ival 28)
 threadidx: (1 ,1) blockidx:(1 ,1) coordinate: (5 ,3) global index: (29 ival 29)
 threadidx: (2 ,1) blockidx:(1 ,1) coordinate: (6 ,3) global index: (30 ival 30)
 threadidx: (3 ,1) blockidx:(1 ,1) coordinate: (7 ,3) global index: (31 ival 31)
 threadidx: (0 ,0) blockidx:(1 ,0) coordinate: (4 ,0) global index: ( 4 ival 4)
 threadidx: (1 ,0) blockidx:(1 ,0) coordinate: (5 ,0) global index: ( 5 ival 5)
 threadidx: (2 ,0) blockidx:(1 ,0) coordinate: (6 ,0) global index: ( 6 ival 6)
 threadidx: (3 ,0) blockidx:(1 ,0) coordinate: (7 ,0) global index: ( 7 ival 7)
 threadidx: (0 ,1) blockidx:(1 ,0) coordinate: (4 ,1) global index: (12 ival 12)
 threadidx: (1 ,1) blockidx:(1 ,0) coordinate: (5 ,1) global index: (13 ival 13)
 threadidx: (2 ,1) blockidx:(1 ,0) coordinate: (6 ,1) global index: (14 ival 14)
 threadidx: (3 ,1) blockidx:(1 ,0) coordinate: (7 ,1) global index: (15 ival 15)
 threadidx: (0 ,0) blockidx:(0 ,2) coordinate: (0 ,4) global index: (32 ival 32)
 threadidx: (1 ,0) blockidx:(0 ,2) coordinate: (1 ,4) global index: (33 ival 33)
 threadidx: (2 ,0) blockidx:(0 ,2) coordinate: (2 ,4) global index: (34 ival 34)
 threadidx: (3 ,0) blockidx:(0 ,2) coordinate: (3 ,4) global index: (35 ival 35)
 threadidx: (0 ,1) blockidx:(0 ,2) coordinate: (0 ,5) global index: (40 ival 40)
 threadidx: (1 ,1) blockidx:(0 ,2) coordinate: (1 ,5) global index: (41 ival 41)
 threadidx: (2 ,1) blockidx:(0 ,2) coordinate: (2 ,5) global index: (42 ival 42)
 threadidx: (3 ,1) blockidx:(0 ,2) coordinate: (3 ,5) global index: (43 ival 43)
 threadidx: (0 ,0) blockidx:(0 ,0) coordinate: (0 ,0) global index: ( 0 ival 0)
 threadidx: (1 ,0) blockidx:(0 ,0) coordinate: (1 ,0) global index: ( 1 ival 1)
 threadidx: (2 ,0) blockidx:(0 ,0) coordinate: (2 ,0) global index: ( 2 ival 2)
 threadidx: (3 ,0) blockidx:(0 ,0) coordinate: (3 ,0) global index: ( 3 ival 3)
 threadidx: (0 ,1) blockidx:(0 ,0) coordinate: (0 ,1) global index: ( 8 ival 8)
 threadidx: (1 ,1) blockidx:(0 ,0) coordinate: (1 ,1) global index: ( 9 ival 9)
 threadidx: (2 ,1) blockidx:(0 ,0) coordinate: (2 ,1) global index: (10 ival 10)
 threadidx: (3 ,1) blockidx:(0 ,0) coordinate: (3 ,1) global index: (11 ival 11)
 threadidx: (0 ,0) blockidx:(1 ,2) coordinate: (4 ,4) global index: (36 ival 36)
 threadidx: (1 ,0) blockidx:(1 ,2) coordinate: (5 ,4) global index: (37 ival 37)
 threadidx: (2 ,0) blockidx:(1 ,2) coordinate: (6 ,4) global index: (38 ival 38)
 threadidx: (3 ,0) blockidx:(1 ,2) coordinate: (7 ,4) global index: (39 ival 39)
 threadidx: (0 ,1) blockidx:(1 ,2) coordinate: (4 ,5) global index: (44 ival 44)
 threadidx: (1 ,1) blockidx:(1 ,2) coordinate: (5 ,5) global index: (45 ival 45)
 threadidx: (2 ,1) blockidx:(1 ,2) coordinate: (6 ,5) global index: (46 ival 46)
 threadidx: (3 ,1) blockidx:(1 ,2) coordinate: (7 ,5) global index: (47 ival 47)



















