CUDA编程学习（四）内存拷贝

news2026/3/16 17:43:24

本篇文章介绍如何把存储在主机内存上的数据拷贝到设备（显卡）的内存上。我们将逐步分析代码，完整代码如下：

```cuda
#include <cuda_runtime.h>
#include "../common/common.h"
#include <stdio.h>

void initialData(float *ip,int size)
{
    time_t t;
    srand((unsigned)time(&t));
    printf("Matrix is :");
    for (int i=0;i<size;i++)
    {
        ip[i]=(float)(rand() & 0xFF) / 10.0f;
        printf("%.2f",ip[i]);
    }
    printf("\n");
    return;
}

int main(int argc, char **argv)
{
    int nDeviceNumber = 0;
    cudaError_t error = ErrorCheck(cudaGetDeviceCount(&nDeviceNumber),__FILE__,__LINE__);
    if(error != cudaSuccess || nDeviceNumber == 0)
    {
        printf("No CUDA campatable GPU found!\n");
        return -1;
    }

    int dev=0;
    error = ErrorCheck(cudaSetDevice(dev),__FILE__,__LINE__);
    if(error!=cudaSuccess)
    {
        printf("fail to set GPU 0 for computing\n");
        return -1;
    }
    else{
        printf("Set GPU 0 for computing\n");
    }

    int nElem = 16;
    size_t nBytes = nElem * sizeof(float);
    float *h_A,*h_B,*gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);
    if(NULL != h_A && NULL != h_B && NULL !=gpuRef)
    {
        printf("allocate memory successfully\n");
    }
    else{
        printf("fail to allocate memory\n");
        return -1;
    }

    initialData(h_A,nElem);
    initialData(h_B,nElem);
    memset(gpuRef,0,nBytes);

    float *d_A,*d_B,*d_C;
    cudaMalloc((float **)&d_A,nBytes);
    cudaMalloc((float **)&d_B,nBytes);
    cudaMalloc((float **)&d_C,nBytes);
    if(d_A == NULL || d_B == NULL || d_C==NULL)
    {
        printf("fail to allocate memory for GPU\n");
        free(h_A);
        free(h_B);
        free(gpuRef);
        return -1;
    }
    else
    {
        printf("successfully allocate memory for GPU\n");
    }

    if(cudaSuccess == cudaMemcpy(d_A,h_A,nBytes,cudaMemcpyHostToDevice) &&
       cudaSuccess == cudaMemcpy(d_B,h_B,nBytes,cudaMemcpyHostToDevice) &&
       cudaSuccess == cudaMemcpy(d_C,gpuRef,nBytes,cudaMemcpyHostToDevice))
    {
        printf("Successfully copy data from CPU to GPU!\r\n");
    }
    else
    {
        printf("fail to copy data from CPU to GPU!\r\n");
    }

    free(h_A);
    free(h_B);
    free(gpuRef);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaDeviceReset();
    return 0;
}
```

common.h中定义了一个函数，用于检测CUDA程序运行中的状态是否正确。

```cuda
#include <sys/time.h>
#include <cuda_runtime.h>
#include <stdio.h>

cudaError_t ErrorCheck(cudaError_t status, const char* filename,int lineNumber)
{
    if (status != cudaSuccess)
    {
        printf("CUDA API error:\r\ncode=%d,name=%s,description=%s\r\nfile=%s,line=%d\r\n",
               status,cudaGetErrorName(status),cudaGetErrorString(status),filename,lineNumber);
        return status;
    }
    return status;
}
```

首先看第一段代码，定义了初始化矩阵的函数：

```cuda
void initialData(float *ip,int size)
{
    time_t t; //时间变量，用于获取系统现在的时间，它的值时刻都在改变
    srand((unsigned)time(&t)); //与rand()函数搭配使用，利用t来生成不断变化的种子，使得rand()函数每次运行的结果都不一样
    printf("Matrix is: ");
    for (int i=0; i<size;i++)
    {
        ip[i] = (float)(rand()&0xFF)/10.0f; //利用rand函数生成随机数字，给数组ip赋值
        printf("%.2f",ip[i]);
    }
    printf("\n");
    return;
}
```

main函数中的第一部分代码用于检测GPU是否可用：

```cuda
int nDeviceNumber = 0; //该变量用于存储可用GPU的数量，初始值为0
cudaError_t error = ErrorCheck(cudaGetDeviceCount(&nDeviceNumber),__FILE__,__LINE__); //用于检测可用GPU数量以及其是否可用
if(error != cudaSuccess || nDeviceNumber == 0)
{
    //如果可用显卡数为0或者cuda启动失败，退出进程
    printf("No CUDA campatable GPU found!\n");
    return -1;
}

int dev=0; //GPU的编号，默认为0
error = ErrorCheck(cudaSetDevice(dev),__FILE__,__LINE__); //设置显卡状态，检测0号显卡是否可用
if(error != cudaSuccess)
{
    printf("fail to set GPU 0 for computing\n");
    return -1;
}
else
{
    printf("Set GPU 0 for computing\n");
}
```

第二部分开始分配内存。先分配主机内存，再分配设备内存：

```cuda
int nElem = 16;
size_t nBytes = nElem * sizeof(float); //待分配的内存空间的大小
float *h_A,*h_B,*gpuRef;
h_A = (float *)malloc(nBytes);
h_B = (float *)malloc(nBytes);
gpuRef = (float *)malloc(nBytes); //定义主机内存，利用malloc()函数分配指定大小内存
if(NULL != h_A && NULL != h_B && NULL !=gpuRef)
{
    //检查是否分配成功
    printf("allocate memory successfully\n");
}
else{
    printf("fail to allocate memory\n");
    return -1;
}
initialData(h_A,nElem);
initialData(h_B,nElem); //因为分配的内存空间上可能已经存储了数据，所以需要初始化
memset(gpuRef,0,nBytes); //这块内存上指定长度空间存储的数值都设定为0
```

分配设备内存：

```cuda
float *d_A,*d_B,*d_C;
cudaMalloc((float **)&d_A,nBytes);
cudaMalloc((float **)&d_B,nBytes);
cudaMalloc((float **)&d_C,nBytes);
//利用cuda专门用于分配内存的函数给指定地址分配内存空间。
//由于C语言传参默认传的是数值，如果要对传参的值进行改变，必须传其指针地址
if(d_A == NULL || d_B == NULL || d_C==NULL)
{
    //如果分配GPU显存失败，退出并释放掉已经分配的CPU内存
    printf("fail to allocate memory for GPU\n");
    free(h_A);
    free(h_B);
    free(gpuRef);
    return -1;
}
else
{
    printf("successfully allocate memory for GPU\n");
}
```

第三部分：分配好了主机内存和设备显存以后，开始进行数据拷贝：

```cuda
if(cudaSuccess == cudaMemcpy(d_A,h_A,nBytes,cudaMemcpyHostToDevice) &&
   cudaSuccess == cudaMemcpy(d_B,h_B,nBytes,cudaMemcpyHostToDevice) &&
   cudaSuccess == cudaMemcpy(d_C,gpuRef,nBytes,cudaMemcpyHostToDevice))
{
    printf("Successfully copy data from CPU to GPU!\r\n");
}
//判断数据拷贝是否成功。cudaMemcpy()函数的四个传参分别为目标地址（这里是设备地址）、源地址（这里是主机地址）、内存大小，以及拷贝方向（从设备到主机还是从主机到设备，这里是从主机到设备）
else
{
    printf("fail to copy data from CPU to GPU!\r\n");
}
free(h_A);
free(h_B);
free(gpuRef);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaDeviceReset(); //程序结束，释放所有内存
return 0;
```

使用nvcc进行编译，运行可执行文件，得到：

```
Set GPU 0 for computing
allocate memory successfully
Matrix is :
Matrix is :
successfully allocate memory for GPU
Successfully copy data from CPU to GPU!
[1] + Done "/usr/bin/gdb" --interpreter=mi --tty=${DbgTerm} 0<"/tmp/Microsoft-MIEngine-In-cuqcf0bg.mrd" 1>"/tmp/Microsoft-MIEngine-Out-f3ch05dq.ck0"
```

说明拷贝成功。

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.coloradmin.cn/o/2416684.html

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈，一经查实，立即删除！