CANN学习中心:AddCustom算子工程示例
完整示例AddCustom 算子工程【免费下载链接】cann-learning-hubCANN 学习中心仓支持在线互动运行、边学边练提供教程、示例与优化方案一站式助力昇腾开发者快速上手。项目地址: https://gitcode.com/cann/cann-learning-hub1. 创建算子原型定义文件add_custom.json:[{ op: AddCustom, input_desc: [ { name: x, param_type: required, format: [ND, ND], type: [float16, float] }, { name: y, param_type: required, format: [ND, ND], type: [float16, float] } ], output_desc: [ { name: z, param_type: required, format: [ND, ND], type: [float16, float] } ] }]2. 创建算子工程msopgen gen -i add_custom.json \ -c ai_core-ascend910b1 \ -lan cpp \ -out custom_op生成的目录结构:custom_op/ ├── framework/ ├── op_host/ │ ├── add_custom.cpp │ └── CMakeLists.txt ├── op_kernel/ │ ├── add_custom_tiling.h │ ├── add_custom.cpp │ └── CMakeLists.txt ├── CMakeLists.txt ├── CMakePresets.json └── build.sh3. Tiling 数据结构op_kernel/add_custom_tiling.h:#ifndef ADD_CUSTOM_TILING_H #define ADD_CUSTOM_TILING_H #include cstdint struct TilingData { uint32_t totalLength; uint32_t tileNum; }; #endif4. Host 侧实现op_host/add_custom.cpp:#include register/op_def_registry.h #include ../op_kernel/add_custom_tiling.h namespace optiling { static ge::graphStatus TilingFunc(gert::TilingContext *context) { uint32_t totalLength context-GetInputShape(0)-GetOriginShape().GetShapeSize(); context-SetBlockDim(8); TilingData *tiling context-GetTilingDataTilingData(); tiling-totalLength totalLength; tiling-tileNum 1; return ge::GRAPH_SUCCESS; } } // namespace optiling namespace ge { static graphStatus InferShape(gert::InferShapeContext *context) { const gert::Shape *inputShape context-GetInputShape(0); gert::Shape *outputShape context-GetOutputShape(0); *outputShape *inputShape; return GRAPH_SUCCESS; } static graphStatus InferDataType(gert::InferDataTypeContext *context) { context-SetOutputDataType(0, context-GetInputDataType(0)); return ge::GRAPH_SUCCESS; } } // namespace ge namespace ops { class AddCustom : public OpDef { public: explicit AddCustom(const char *name) : OpDef(name) { this-Input(x) .ParamType(REQUIRED) .DataType({ge::DT_FLOAT16, ge::DT_FLOAT}) .Format({ge::FORMAT_ND, ge::FORMAT_ND}); this-Input(y) .ParamType(REQUIRED) .DataType({ge::DT_FLOAT16, ge::DT_FLOAT}) .Format({ge::FORMAT_ND, ge::FORMAT_ND}); this-Output(z) .ParamType(REQUIRED) .DataType({ge::DT_FLOAT16, ge::DT_FLOAT}) .Format({ge::FORMAT_ND, ge::FORMAT_ND}); this-SetInferShape(ge::InferShape) .SetInferDataType(ge::InferDataType); this-AICore() .SetTiling(optiling::TilingFunc) .AddConfig(ascend910b); } }; OP_ADD(AddCustom); } // namespace ops5. Kernel 侧实现op_kernel/add_custom.cpp:#include kernel_operator.h #include add_custom_tiling.h constexpr int32_t BUFFER_NUM 1; template class dtypeX, class dtypeY, class dtypeZ class KernelAdd { public: __aicore__ inline void Init( GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum) { this-blockLength totalLength / AscendC::GetBlockNum(); this-tileNum tileNum; this-tileLength this-blockLength / tileNum / BUFFER_NUM; xGm.SetGlobalBuffer((__gm__ dtypeX *)x this-blockLength * AscendC::GetBlockIdx(), this-blockLength); yGm.SetGlobalBuffer((__gm__ dtypeY *)y this-blockLength * AscendC::GetBlockIdx(), this-blockLength); zGm.SetGlobalBuffer((__gm__ dtypeZ *)z this-blockLength * AscendC::GetBlockIdx(), this-blockLength); pipe.InitBuffer(inQueueX, BUFFER_NUM, this-tileLength * sizeof(dtypeX)); pipe.InitBuffer(inQueueY, BUFFER_NUM, this-tileLength * sizeof(dtypeY)); pipe.InitBuffer(outQueueZ, BUFFER_NUM, this-tileLength * sizeof(dtypeZ)); } __aicore__ inline void Process() { int32_t loopCount this-tileNum * BUFFER_NUM; for (int32_t i 0; i loopCount; i) { CopyIn(i); Compute(i); CopyOut(i); } } private: __aicore__ inline void CopyIn(int32_t progress) { AscendC::LocalTensordtypeX xLocal inQueueX.AllocTensordtypeX(); AscendC::LocalTensordtypeY yLocal inQueueY.AllocTensordtypeY(); AscendC::DataCopy(xLocal, xGm[progress * this-tileLength], this-tileLength); AscendC::DataCopy(yLocal, yGm[progress * this-tileLength], this-tileLength); inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progress) { AscendC::LocalTensordtypeX xLocal inQueueX.DeQuedtypeX(); AscendC::LocalTensordtypeY yLocal inQueueY.DeQuedtypeY(); AscendC::LocalTensordtypeZ zLocal outQueueZ.AllocTensordtypeZ(); AscendC::Add(zLocal, xLocal, yLocal, this-tileLength); outQueueZ.EnQuedtypeZ(zLocal); inQueueX.FreeTensor(xLocal); inQueueY.FreeTensor(yLocal); } __aicore__ inline void CopyOut(int32_t progress) { AscendC::LocalTensordtypeZ zLocal outQueueZ.DeQuedtypeZ(); AscendC::DataCopy(zGm[progress * this-tileLength], zLocal, this-tileLength); outQueueZ.FreeTensor(zLocal); } private: AscendC::TPipe pipe; AscendC::TQueAscendC::TPosition::VECIN, BUFFER_NUM inQueueX; AscendC::TQueAscendC::TPosition::VECIN, BUFFER_NUM inQueueY; AscendC::TQueAscendC::TPosition::VECOUT, BUFFER_NUM outQueueZ; AscendC::GlobalTensordtypeX xGm; AscendC::GlobalTensordtypeY yGm; AscendC::GlobalTensordtypeZ zGm; uint32_t blockLength; uint32_t tileNum; uint32_t tileLength; }; extern C __global__ __aicore__ void add_custom( GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) { REGISTER_TILING_DEFAULT(TilingData); GET_TILING_DATA_WITH_STRUCT(TilingData, tiling_data, tiling); KernelAddDTYPE_X, DTYPE_Y, DTYPE_Z op; op.Init(x, y, z, tiling_data.totalLength, tiling_data.tileNum); op.Process(); }6. 编译安装# 编译 cd custom_op bash build.sh # 安装 ./build_out/custom_opp_*.run --install-path${HOME}/ # 设置环境 source ${HOME}/vendors/customize/bin/set_env.bash7. 测试代码test/main.cpp:#include algorithm #include cstdint #include iostream #include vector #include acl/acl.h #include aclnn_add_custom.h // 自动生成的头文件 #define SUCCESS 0 #define FAILED 1 #define CHECK_RET(cond, return_expr) \ do { \ if (!(cond)) { \ return_expr; \ } \ } while (0) #define LOG_PRINT(message, ...) \ do { \ printf(message, ##__VA_ARGS__); \ } while (0) int64_t GetShapeSize(const std::vectorint64_t shape) { int64_t shapeSize 1; for (auto i : shape) { shapeSize * i; } return shapeSize; } int Init(int32_t deviceId, aclrtStream *stream) { auto ret aclInit(nullptr); CHECK_RET(ret ACL_SUCCESS, LOG_PRINT(aclInit failed. ERROR: %d\n, ret); return FAILED); ret aclrtSetDevice(deviceId); CHECK_RET(ret ACL_SUCCESS, LOG_PRINT(aclrtSetDevice failed. ERROR: %d\n, ret); return FAILED); ret aclrtCreateStream(stream); CHECK_RET(ret ACL_SUCCESS, LOG_PRINT(aclrtCreateStream failed. ERROR: %d\n, ret); return FAILED); return SUCCESS; } template typename T int CreateAclTensor(const std::vectorT hostData, const std::vectorint64_t shape, void **deviceAddr, aclDataType dataType, aclTensor **tensor) { auto size GetShapeSize(shape) * sizeof(T); auto ret aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret ACL_SUCCESS, LOG_PRINT(aclrtMalloc failed. ERROR: %d\n, ret); return FAILED); ret aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE); CHECK_RET(ret ACL_SUCCESS, LOG_PRINT(aclrtMemcpy failed. ERROR: %d\n, ret); return FAILED); *tensor aclCreateTensor(shape.data(), shape.size(), dataType, nullptr, 0, aclFormat::ACL_FORMAT_ND, shape.data(), shape.size(), *deviceAddr); return SUCCESS; } void DestroyResources(std::vectorvoid * tensors, std::vectorvoid * deviceAddrs, aclrtStream stream, int32_t deviceId, void *workspaceAddr nullptr) { for (uint32_t i 0; i tensors.size(); i) { if (tensors[i] ! nullptr) { aclDestroyTensor(reinterpret_castaclTensor *(tensors[i])); } if (deviceAddrs[i] ! nullptr) { aclrtFree(deviceAddrs[i]); } } if (workspaceAddr ! nullptr) { aclrtFree(workspaceAddr); } aclrtDestroyStream(stream); aclrtResetDevice(deviceId); aclFinalize(); } int main(int argc, char **argv) { // 1. 初始化设备和 stream int32_t deviceId 0; aclrtStream stream; auto ret Init(deviceId, stream); CHECK_RET(ret 0, LOG_PRINT(Init acl failed. ERROR: %d\n, ret); return FAILED); // 2. 创建输入输出张量 std::vectorint64_t inputXShape {8, 2048}; std::vectorint64_t inputYShape {8, 2048}; std::vectorint64_t outputZShape {8, 2048}; void *inputXDeviceAddr nullptr; void *inputYDeviceAddr nullptr; void *outputZDeviceAddr nullptr; aclTensor *inputX nullptr; aclTensor *inputY nullptr; aclTensor *outputZ nullptr; // 准备 host 数据 std::vectoraclFloat16 inputXHostData(inputXShape[0] * inputXShape[1]); std::vectoraclFloat16 inputYHostData(inputYShape[0] * inputYShape[1]); std::vectoraclFloat16 outputZHostData(outputZShape[0] * outputZShape[1]); for (int i 0; i inputXShape[0] * inputXShape[1]; i) { inputXHostData[i] aclFloatToFloat16(1.0); inputYHostData[i] aclFloatToFloat16(2.0); outputZHostData[i] aclFloatToFloat16(0.0); } std::vectorvoid * tensors {inputX, inputY, outputZ}; std::vectorvoid * deviceAddrs {inputXDeviceAddr, inputYDeviceAddr, outputZDeviceAddr}; // 创建 aclTensor ret CreateAclTensor(inputXHostData, inputXShape, inputXDeviceAddr, aclDataType::ACL_FLOAT16, inputX); CHECK_RET(ret ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return FAILED); ret CreateAclTensor(inputYHostData, inputYShape, inputYDeviceAddr, aclDataType::ACL_FLOAT16, inputY); CHECK_RET(ret ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return FAILED); ret CreateAclTensor(outputZHostData, outputZShape, outputZDeviceAddr, aclDataType::ACL_FLOAT16, outputZ); CHECK_RET(ret ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return FAILED); // 3. 调用算子 API二段式 uint64_t workspaceSize 0; aclOpExecutor *executor; // 第一段获取 workspace 大小 ret aclnnAddCustomGetWorkspaceSize(inputX, inputY, outputZ, workspaceSize, executor); CHECK_RET(ret ACL_SUCCESS, LOG_PRINT(aclnnAddCustomGetWorkspaceSize failed. ERROR: %d\n, ret); DestroyResources(tensors, deviceAddrs, stream, deviceId); return FAILED); // 分配 workspace void *workspaceAddr nullptr; if (workspaceSize 0) { ret aclrtMalloc(workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); CHECK_RET(ret ACL_SUCCESS, LOG_PRINT(allocate workspace failed. ERROR: %d\n, ret); DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return FAILED); } // 第二段执行算子 ret aclnnAddCustom(workspaceAddr, workspaceSize, executor, stream); CHECK_RET(ret ACL_SUCCESS, LOG_PRINT(aclnnAddCustom failed. ERROR: %d\n, ret); DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return FAILED); // 4. 同步等待 ret aclrtSynchronizeStream(stream); CHECK_RET(ret ACL_SUCCESS, LOG_PRINT(aclrtSynchronizeStream failed. ERROR: %d\n, ret); DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return FAILED); // 5. 获取输出结果 auto size GetShapeSize(outputZShape); std::vectoraclFloat16 resultData(size, 0); ret aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outputZDeviceAddr, size * sizeof(aclFloat16), ACL_MEMCPY_DEVICE_TO_HOST); CHECK_RET(ret ACL_SUCCESS, LOG_PRINT(copy result failed. ERROR: %d\n, ret); DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return FAILED); // 6. 销毁资源 DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); // 7. 验证结果 std::vectoraclFloat16 goldenData(size, aclFloatToFloat16(3.0)); LOG_PRINT(result is:\n); for (int64_t i 0; i 10; i) { LOG_PRINT(%.1f , aclFloat16ToFloat(resultData[i])); } LOG_PRINT(\n); if (std::equal(resultData.begin(), resultData.end(), goldenData.begin())) { LOG_PRINT(test pass\n); } else { LOG_PRINT(test failed\n); return FAILED; } return SUCCESS; }8. 编译运行测试# 编译测试代码 g -I$ASCEND_TOOLKIT_HOME/include \ -I${HOME}/vendors/customize/op_api/include \ -L$ASCEND_TOOLKIT_HOME/lib64 \ -L${HOME}/vendors/customize/op_api/lib \ test/main.cpp \ -lcust_opapi -lnnopbase -lacl_rt \ -o test_add # 运行测试 source ${HOME}/vendors/customize/bin/set_env.bash ./test_add预期输出:result is: 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 test pass【免费下载链接】cann-learning-hubCANN 学习中心仓支持在线互动运行、边学边练提供教程、示例与优化方案一站式助力昇腾开发者快速上手。项目地址: https://gitcode.com/cann/cann-learning-hub创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/2598964.html
如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!