Linux 下 DMA 内存映射浅析

序

系统 I/O 设备驱动程序通常调用其特定子系统的接口为 DMA 分配内存，但最终会调到 DMA 子系统的dma_alloc_coherent()/dma_alloc_attrs() 等接口。

关于 dma_alloc_coherent 接口详细的代码讲解、调用流程，可以参考这篇文章，我觉得写的非常好！
深入浅出 Linux 中的 ARM IOMMU SMMU III

注：本篇文章代码，均节选自 Linux 6.5.3 版本

1、DMA 映射类型

Linux 系统支持两种 DMA 映射类型：一致性 DMA 映射（Consistent DMA Mapping）和流式 DMA 映射（Streaming DMA Mapping）。二者的核心差异在于：

一致性 DMA 映射情况下，CPU 和设备看到的 DMA 内存始终一致，不需要软件做 Cache 同步：对于不支持硬件一致性的设备，内核会把这段内存映射为非 Cache 属性，CPU 访问时全程不使用 Cache；对于支持硬件一致性（dma-coherent）的设备，则由硬件维护一致性，内存仍可以保持 Cacheable；
流式 DMA 映射则只在 DMA 传输数据时，通过软件操作来处理 Cache 的同步，以减轻关闭 Cache 对性能的影响。

2、一致性 DMA

对于不支持硬件一致性的设备，一致性 DMA 映射本质上是利用了硬件的支持（页表内存属性），禁用了 DMA 缓冲区的 Cache 功能；对于 dma-coherent 设备，则由硬件保证一致性，无需关闭 Cache。CPU 和 DMA controller 在发起对 DMA 缓冲区的并行访问的时候不需要考虑 cache 的影响，也就是说不需要软件进行 Cache 操作，CPU 和 DMA controller 都可以看到对方对 DMA 缓冲区的更新。

dma_alloc_coherent

DMA 一致性映射使用 dma_alloc_coherent 接口来分配 DMA 内存，实现流程如下：

dma_alloc_coherent（dma_alloc_attrs）

	--> dma_alloc_from_dev_coherent (如果 Device 绑定了Reserved Memory)
	
	--> dma_direct_alloc (如果 Device 没有绑定 Reserved Memory，且没有注册 dma_ops，走直接映射)
	
	--> dma_ops->alloc (如果 Device 绑定了 IOMMU，使用 IOMMU 申请接口)

2.1 Device 绑定了 Reserved Memory

dma_alloc_from_dev_coherent 函数从设备的 coherent 内存池中分配内存。coherent 内存池由设备驱动程序通过 dma_declare_coherent_memory 接口创建，该接口内部调用 dma_init_coherent_memory 完成内存池的初始化，后者的实现 (位于 kernel/dma/coherent.c 文件中) 如下：

/*
 * Excerpt (the tail of the function is elided with "......").
 *
 * Maps the physical range starting at phys_addr, size bytes long, with
 * memremap() using the MEMREMAP_WC flag.
 *
 * Returns ERR_PTR(-EINVAL) when size is zero or when memremap() fails.
 * What is returned on success is not visible in this excerpt.
 */
static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
		dma_addr_t device_addr, size_t size, bool use_dma_pfn_offset)
{
	struct dma_coherent_mem *dma_mem;
	/* size expressed in whole pages */
	int pages = size >> PAGE_SHIFT;
	void *mem_base;

	/* An empty region is rejected up front. */
	if (!size)
		return ERR_PTR(-EINVAL);

	/* Request a write-combine mapping of the region. */
	mem_base = memremap(phys_addr, size, MEMREMAP_WC);
	if (!mem_base)
		return ERR_PTR(-EINVAL);
	......
}

/*
 * Excerpt of memremap(): only the write-through and write-combine fallbacks
 * are shown; the surrounding code is elided with "......".
 *
 * Each fallback runs only while addr is still unset, so a mapping obtained
 * earlier (in the elided part, or by the write-through branch) is kept.
 */
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
{
	......
	/* Write-through mapping, if requested and nothing has been mapped yet. */
	if (!addr && (flags & MEMREMAP_WT))
		addr = ioremap_wt(offset, size);

	/* Write-combine mapping, if requested and nothing has been mapped yet. */
	if (!addr && (flags & MEMREMAP_WC))
		addr = ioremap_wc(offset, size);
	......
}

从 mem_base = memremap(phys_addr, size, MEMREMAP_WC); 这行代码不难看出，默认情况下，该接口映射出的内存类型是 WC（Write Combine），最终调用到的是 ioremap_wc 接口。对于 ARM64 架构而言，这里建立的映射使用的内存属性是 PROT_NORMAL_NC（Normal Non-cacheable）。

2.2 Device 没有绑定 Reserved Memory

（原文此处为 dma_direct_alloc 的流程示意图，图片缺失；对应的代码节选如下。）

/*
 * Excerpt of the direct-mapping coherent allocator (parts are elided with
 * "......"; remap, set_uncached, page, ret and the out_free_pages label are
 * declared in the elided parts).
 *
 * Visible flow:
 *   1. For a non-coherent device, choose a strategy: arch_dma_alloc(), the
 *      global coherent pool, the atomic pool, or fall through with either
 *      remap or set_uncached selected.
 *   2. Allocate pages with __dma_direct_alloc_pages().
 *   3. Either remap those pages with the protection from dma_pgprot(), or use
 *      their existing kernel address from page_address().
 */
void *dma_direct_alloc(struct device *dev, size_t size,
		dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
	......
	/* Non-coherent device: pick the allocation strategy */
	if (!dev_is_dma_coherent(dev)) {
		/*
		 * With neither uncached-set nor direct-remap support
		 * configured, hand the request to the architecture's
		 * arch_dma_alloc().
		 */
		if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
		    !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
		    !dev_is_dma_coherent(dev))
			return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
		
		/*
		 * If there is a global pool, always allocate from it for
		 * non-coherent devices.
		 */
		if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL))
			return dma_alloc_from_global_coherent(dev, size,
					dma_handle);
					
		/*
		 * Otherwise remap if the architecture is asking for it.  But
		 * given that remapping memory is a blocking operation we'll
		 * instead have to dip into the atomic pools.
		 */
		remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
		if (remap) {
			if (dma_direct_use_pool(dev, gfp))
				return dma_direct_alloc_from_pool(dev, size,
						dma_handle, gfp);
		} else {
			/* No remap support: the only option left is set_uncached. */
			if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED))
				return NULL;
			set_uncached = true;
		}
	}
	......
	/* we always manually zero the memory once we are done */
	/* Allocate pages for the requested size; __GFP_ZERO is masked off here. */
	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
	if (!page)
		return NULL;

	/*
	 * dma_alloc_contiguous can return highmem pages depending on a
	 * combination the cma= arguments and per-arch setup.  These need to be
	 * remapped to return a kernel virtual address.
	 */
	if (PageHighMem(page)) {
		remap = true;
		set_uncached = false;
	}
	
	/*
	 * Remap path: taken when remapping was selected above for a
	 * non-coherent device (CONFIG_DMA_DIRECT_REMAP) or when the pages
	 * are in highmem.
	 */
	if (remap) {
		/* Page protection (memory attributes) for the new mapping */
		pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);

		if (force_dma_unencrypted(dev))
			prot = pgprot_decrypted(prot);

		/* remove any dirty cache lines on the kernel alias */
		arch_dma_prep_coherent(page, size);

		/* create a coherent mapping */
		/* Remap the pages just allocated using the protection computed above. */
		ret = dma_common_contiguous_remap(page, size, prot,
				__builtin_return_address(0));
		if (!ret)
			goto out_free_pages;
	} else {
		/* No remap: use the pages' existing kernel virtual address. */
		ret = page_address(page);
		if (dma_set_decrypted(dev, ret, size))
			goto out_free_pages;
	}
	......
}

2.2.1 dma-coherent

关于设备是否是 dma-coherent（注意，dma-coherent 表示的是硬件维护一致性！），即 if (!dev_is_dma_coherent(dev)) 的判断结果，由 dts 节点中是否带有 dma-coherent 属性决定。例如，rk3399 dts 中的 rga 节点：

	/*
	 * RGA device node. The dma-coherent property below is the flag the
	 * surrounding text refers to: it marks the device as hardware
	 * cache-coherent for DMA.
	 */
	rga: rga@ff680000 {
		compatible = "rockchip,rga2";
		dev_mode = <1>;
		reg = <0x0 0xff680000 0x0 0x1000>;
		interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH 0>;
		clocks = <&cru ACLK_RGA>, <&cru HCLK_RGA>, <&cru SCLK_RGA_CORE>;
		clock-names = "aclk_rga", "hclk_rga", "clk_rga";
		/* The power-domain ID carries an RK3399 prefix. */
		power-domains = <&power RK3399_PD_RGA>;
		dma-coherent;
		status = "okay";
	};

2.2.2 dma_alloc_from_global_coherent

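当设备不支持硬件一致性时，若内核启用了 global dma pool（CONFIG_DMA_GLOBAL_POOL），则将从 global dma pool 中分配 dma 内存。global dma pool 来自 dts 的 reserved-memory 中 compatible 值为 shared-dma-pool 的节点；按照 kernel/dma/coherent.c 的实现，该节点还需要带有 linux,dma-default 属性，并且不能带 reusable 属性（带 reusable 的 shared-dma-pool 节点会被当作 CMA 区域处理）。下面以 rk3568 dts 中的一个 shared-dma-pool 节点为例说明节点的写法（注意：该节点带有 reusable 和 linux,cma-default，实际是 CMA 的用法）：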

	/*
	 * Reserved-memory container with a single shared-dma-pool child.
	 * NOTE(review): the child carries `reusable` and `linux,cma-default`,
	 * which is the CMA form of shared-dma-pool - confirm whether this node
	 * really feeds the global coherent pool discussed in the text.
	 */
	reserved-memory {
		#address-cells = <2>;
		#size-cells = <2>;
		ranges;

		/* Reserve 128MB memory for hdmirx-controller@fdee0000 */
		cma {
			compatible = "shared-dma-pool";
			reusable;
			/* base = 256 MiB, size = 128 MiB (two cells each) */
			reg = <0x0 (256 * 0x100000) 0x0 (128 * 0x100000)>;
			linux,cma-default;
		};
	};

当 dts 包含以上节点时，则在 linux 内核初始化时，会通过以下接口为其初始化 global dma pool，并将相应的保留内存设置到该内存池中：

dma_init_reserved_memory                                    
	--> dma_init_global_coherent
 		--> dma_init_coherent_memory (该函数 2.1 章节有讲解)

2.2.3 dma_pgprot

dma_direct_alloc 接口中调用的 dma_pgprot 接口，对于非硬件一致性的设备，默认情况下（未指定 DMA_ATTR_WRITE_COMBINE）调用的是 pgprot_dmacoherent 接口；对于 dma-coherent 设备，则原样返回传入的页表属性：

/*
 * Return the page attributes used for mapping dma_alloc_* memory, either in
 * kernel space if remapping is needed, or to userspace through dma_mmap_*.
 *
 * @dev:   device the mapping is created for
 * @prot:  base page protection to start from
 * @attrs: DMA_ATTR_* flags; only DMA_ATTR_WRITE_COMBINE is examined here
 *
 * A coherent device gets @prot back unchanged. Otherwise the result is
 * pgprot_writecombine(@prot) when write-combine was requested and
 * CONFIG_ARCH_HAS_DMA_WRITE_COMBINE is set, else pgprot_dmacoherent(@prot).
 */
pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
	/* Coherent devices keep the caller's attributes as they are. */
	if (dev_is_dma_coherent(dev))
		return prot;
#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
	if (attrs & DMA_ATTR_WRITE_COMBINE)
		return pgprot_writecombine(prot);
#endif
	/* Default for non-coherent devices. */
	return pgprot_dmacoherent(prot);
}

在 arch/arm64/include/asm/pgtable.h 文件中：

/*
 * DMA allocations for non-coherent devices use what the Arm architecture calls
 * "Normal non-cacheable" memory, which permits speculation, unaligned accesses
 * and merging of writes.  This is different from "Device-nGnR[nE]" memory which
 * is intended for MMIO and thus forbids speculation, preserves access size,
 * requires strict alignment and can also force write responses to come from the
 * endpoint.
 *
 * The macro hands __pgprot_modify() the attribute-index field
 * (PTE_ATTRINDX_MASK) as the mask, and the MT_NORMAL_NC attribute index
 * together with PTE_PXN | PTE_UXN as the new bits.
 */
#define pgprot_dmacoherent(prot) \
	__pgprot_modify(prot, PTE_ATTRINDX_MASK, \
			PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)

到这可以看到，使用 Direct 申请接口，默认情况下分配的 DMA 内存，是 MT_NORMAL_NC 类型的，和 2.1 章节一样。

2.3 使用 iommu

通过已注册的 dev 设备内存分配操作函数分配（ops->alloc）

当设备注册了 dma_ops，则该设备需要通过其 ops 对应的接口分配 dma 内存。以 armv8 的 iommu 为例，该接口的注册流程如下：

of_dma_configure（通常由总线层在设备 probe 之前调用，例如 platform_dma_configure）
	--> of_dma_configure_id (这一步会去解析设备树节点，看是否有 “iommus” 属性)
		--> arch_setup_dma_ops
			-->iommu_setup_dma_ops

arch/arm64/mm/dma-mapping.c

/*
 * arm64 per-device DMA setup.
 *
 * @dev:      device being configured
 * @dma_base: start of the device's DMA window
 * @size:     length of the DMA window
 * @iommu:    IOMMU ops for the device, or NULL when there is none
 * @coherent: whether the device is DMA-coherent
 *
 * Records @coherent in dev->dma_coherent, calls iommu_setup_dma_ops() for the
 * window [dma_base, dma_base + size - 1] when @iommu is non-NULL, and then
 * always calls xen_setup_dma_ops().
 */
void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
			const struct iommu_ops *iommu, bool coherent)
{
	int cls = cache_line_size_of_cpu();

	/*
	 * Warn (and taint) when a non-coherent device is set up on a CPU
	 * whose cache line size exceeds ARCH_DMA_MINALIGN.
	 */
	WARN_TAINT(!coherent && cls > ARCH_DMA_MINALIGN,
		   TAINT_CPU_OUT_OF_SPEC,
		   "%s %s: ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
		   dev_driver_string(dev), dev_name(dev),
		   ARCH_DMA_MINALIGN, cls);

	dev->dma_coherent = coherent;
	/* Only devices that have an IOMMU get the IOMMU DMA setup. */
	if (iommu)
		iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1);

	xen_setup_dma_ops(dev);
}

其中 iommu 对应的 dma_ops 定义如下：

/*
 * DMA mapping operations table for the IOMMU DMA layer.
 *
 * Coherent allocation goes through .alloc/.free (iommu_dma_alloc /
 * iommu_dma_free). Streaming mappings use the .map_* / .unmap_* entries and
 * the .sync_* entries handle cache maintenance. .alloc_pages/.free_pages are
 * the only entries that use the dma_common_* helpers rather than an
 * iommu_dma_* function.
 */
const struct dma_map_ops iommu_dma_ops = {
     .flags = DMA_F_PCI_P2PDMA_SUPPORTED,
     /* coherent / page allocation */
     .alloc = iommu_dma_alloc,
     .free = iommu_dma_free,
     .alloc_pages = dma_common_alloc_pages,
     .free_pages = dma_common_free_pages,
     .alloc_noncontiguous = iommu_dma_alloc_noncontiguous,
     .free_noncontiguous = iommu_dma_free_noncontiguous,
     .mmap = iommu_dma_mmap,
     .get_sgtable = iommu_dma_get_sgtable,
     /* streaming mappings */
     .map_page = iommu_dma_map_page,
     .unmap_page = iommu_dma_unmap_page,
     .map_sg = iommu_dma_map_sg,
     .unmap_sg = iommu_dma_unmap_sg,
     /* sync hooks for streaming mappings */
     .sync_single_for_cpu = iommu_dma_sync_single_for_cpu,
     .sync_single_for_device = iommu_dma_sync_single_for_device,
     .sync_sg_for_cpu = iommu_dma_sync_sg_for_cpu,
     .sync_sg_for_device = iommu_dma_sync_sg_for_device,
     .map_resource = iommu_dma_map_resource,
     .unmap_resource = iommu_dma_unmap_resource,
     .get_merge_boundary = iommu_dma_get_merge_boundary,
     .opt_mapping_size = iommu_dma_opt_mapping_size,
};

3、流式 DMA

与一致性映射接口不同，流式 DMA 映射接口不会分配内存，而是将输入参数中传入的内存映射为 DMA 地址。由于在 CPU 视角下，它也可能被映射成了 cache 类型的内存，故其对应的 cache 中可能含有 dirty 数据。为了确保 DMA 和 CPU 都能获取到正确的数据，则在 DMA 操作流程中，软件需要维护 cache 与主存数据的一致性。

流式 DMA 最大的一个特点就是带 cache，这会带来 cache 和内存的一致性问题。如果处理不好，会触发各种问题，使用的时候必须小心！！！

关于 cache 一致性问题，我前面有篇文章已经做了总结：ARM 架构下 cache 一致性问题整理

流式 DMA 包括以下两类接口：

3.1 内存映射接口

/*
 * Convenience wrappers for the streaming DMA API: each macro forwards its
 * arguments to the matching *_attrs variant with the attrs argument fixed
 * to 0.
 */
#define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0)
#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0)
#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0)
#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0)
#define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0)
#define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0)

这里以 dma_map_single 接口说明 DMA 内存的映射流程：

dma_map_single
	--> dma_map_single_attrs
		--> dma_map_page_attrs
			--> dma_direct_map_page (dma_map_direct() 判断为直接映射方式时调用；仅在需要 bounce buffer 时才会用到 SWIOTLB 机制)
			--> ops->map_page (设备注册了 dma_ops，例如支持 IOMMU 时，通过 get_dma_ops() 得到的操作接口映射内存)

3.2 cache 维护接口

inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
    size_t size, enum dma_data_direction dir)
inline void dma_sync_single_for_device(struct device *dev,
    dma_addr_t addr, size_t size, enum dma_data_direction dir)
inline void dma_sync_sg_for_cpu(struct device *dev,
    struct scatterlist *sg, int nelems, enum dma_data_direction dir)
inline void dma_sync_sg_for_device(struct device *dev,
    struct scatterlist *sg, int nelems, enum dma_data_direction dir)

dma_direct_sync_single_for_cpu

如果你需要多次访问同一个流式映射DMA缓冲区，并且在DMA传输之间读写DMA缓冲区上的数据，这时候你需要使用dma_sync_single_for_cpu进行DMA缓冲区的sync操作，以便CPU和设备可以看到最新的、正确的数据。

dma_direct_sync_single_for_cpu 接口，最终调用的是 arch_sync_dma_for_cpu。以 ARM64 架构为例，其实最终就是对 cache 做维护操作：sync_for_device 方向做 clean（把 cache 中的脏数据写回内存），sync_for_cpu 方向做 invalidate（使 cache 中的旧数据失效）。

arch/arm64/mm/dma-mapping.c

/*
 * Cache maintenance before the device accesses a streaming DMA buffer.
 *
 * @paddr: physical address of the buffer
 * @size:  length of the buffer in bytes
 * @dir:   transfer direction; not examined by this function
 *
 * Converts @paddr to its kernel virtual address and calls dcache_clean_poc()
 * on [start, start + size).
 */
void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
			      enum dma_data_direction dir)
{
	unsigned long start = (unsigned long)phys_to_virt(paddr);

	/* Clean is performed for every direction. */
	dcache_clean_poc(start, start + size);
}

/*
 * Cache maintenance before the CPU accesses a streaming DMA buffer.
 *
 * @paddr: physical address of the buffer
 * @size:  length of the buffer in bytes
 * @dir:   transfer direction
 *
 * Does nothing for DMA_TO_DEVICE. For every other direction it converts
 * @paddr to its kernel virtual address and calls dcache_inval_poc() on
 * [start, start + size).
 */
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
			   enum dma_data_direction dir)
{
	unsigned long start = (unsigned long)phys_to_virt(paddr);

	/*
	 * NOTE(review): presumably skipped because the device only read the
	 * buffer, so the CPU's cached copy cannot be stale - confirm.
	 */
	if (dir == DMA_TO_DEVICE)
		return;

	dcache_inval_poc(start, start + size);
}

/*
 * Prepare freshly allocated pages for use as coherent DMA memory.
 *
 * @page: first page of the allocation
 * @size: length of the allocation in bytes
 *
 * Calls dcache_clean_poc() on the pages' kernel address range
 * [page_address(page), page_address(page) + size).
 */
void arch_dma_prep_coherent(struct page *page, size_t size)
{
	unsigned long start = (unsigned long)page_address(page);

	dcache_clean_poc(start, start + size);
}