DPDK开发之KNI模块代码实现
- 背景
 - KNI实现原理 -- ifreq
 - 代码实现
 - 总结
 
背景
在DPDK开发的时候,如果有些协议不想处理,只处理关注的协议,可以把其他协议写回内核,让内核处理。此时的DPDK就起到分发的作用,类似一个过滤器。
KNI实现原理 – ifreq
主要利用内核的/dev/net/tun。做VPN时也会用到这个设备文件。
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <net/if.h>
#include <linux/if_tun.h>
#include <errno.h>
#include <sys/ioctl.h>
/*
 * Create a TAP interface named `dev` through /dev/net/tun.
 *
 * dev: NUL-terminated interface name. It must be shorter than IFNAMSIZ
 *      bytes so that it fits in ifr_name together with its terminator.
 *
 * Returns the open file descriptor on success. Returns a negative value
 * on failure with errno describing the cause (EINVAL for a NULL name,
 * ENAMETOOLONG for an over-long name, otherwise the errno left by
 * open()/ioctl()).
 */
int tun_alloc(char *dev)
{
    if(dev==NULL)
    {
        errno=EINVAL;
        return -1;
    }
    /* Reject names that would overflow ifr_name; an unbounded copy would
     * spill into the union that also stores ifr_flags. */
    size_t len=strlen(dev);
    if(len>=IFNAMSIZ)
    {
        errno=ENAMETOOLONG;
        return -1;
    }
    int fd=open("/dev/net/tun",O_RDWR);
    if(fd<0)
        return -1;
    struct ifreq ifr;
    memset(&ifr,0,sizeof(ifr));
    /* IFF_TAP works at the Ethernet layer (frames carry MAC addresses);
     * TUN would work at the IP layer instead. */
    ifr.ifr_flags=IFF_TAP|IFF_NO_PI;
    /* ifr was zeroed above, so the copied name stays NUL-terminated. */
    memcpy(ifr.ifr_name,dev,len);
    printf("fd = %d, dev = %s, len = %zu\n",fd,dev,len);
    /* Apply the requested name and flags to the descriptor. */
    int err=ioctl(fd,TUNSETIFF,(void *)&ifr);
    if(err<0)
    {
        /* Keep the ioctl errno: printf()/close() may overwrite it. */
        int saved_errno=errno;
        printf("ioctl fail(%d): %s\n",err,strerror(saved_errno));
        close(fd);
        errno=saved_errno;
        return err;
    }
    return fd;
}
/*
 * Demo entry point: create the "MyDev" TAP device, print what
 * tun_alloc() returned, then wait for input before exiting.
 */
int main()
{
    const int result = tun_alloc("MyDev");

    printf("return code %d\n",result);

    /* Wait on stdin; the descriptor is not closed before this point, so
     * the device can be inspected (e.g. with `ifconfig -a`) meanwhile. */
    (void)getchar();

    return 0;
}
 
特别注意,ifr.ifr_name不能有空格。
 编译:
gcc -o ifr ifr.c
 
执行后,使用如下命令查询:
ifconfig -a
 
可以看到多了MyDev。
MyDev     Link encap:以太网  硬件地址 c2:44:70:f2:79:f9  
          BROADCAST MULTICAST  MTU:1500  跃点数:1
          接收数据包:0 错误:0 丢弃:0 过载:0 帧数:0
          发送数据包:0 错误:0 丢弃:0 过载:0 载波:0
          碰撞:0 发送队列长度:1000 
          接收字节:0 (0.0 B)  发送字节:0 (0.0 B)
ens33     Link encap:以太网  硬件地址 00:0c:29:79:9b:f7  
          inet 地址:192.168.0.106  广播:192.168.0.255  掩码:255.255.255.0
          inet6 地址: fe80::b608:7cba:aa19:e2d/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  跃点数:1
          接收数据包:7543 错误:0 丢弃:0 过载:0 帧数:0
          发送数据包:4518 错误:0 丢弃:0 过载:0 载波:0
          碰撞:0 发送队列长度:1000 
          接收字节:3014986 (3.0 MB)  发送字节:657222 (657.2 KB)
lo        Link encap:本地环回  
          inet 地址:127.0.0.1  掩码:255.0.0.0
          inet6 地址: ::1/128 Scope:Host
          UP LOOPBACK RUNNING  MTU:65536  跃点数:1
          接收数据包:304 错误:0 丢弃:0 过载:0 帧数:0
          发送数据包:304 错误:0 丢弃:0 过载:0 载波:0
          碰撞:0 发送队列长度:1000 
          接收字节:25426 (25.4 KB)  发送字节:25426 (25.4 KB)
 
这就是kni的实现原理,由两部分组成:
 (1)对外提供了一个字符设备,通过ioctl()操作。
 (2)底层是一个网口。
代码实现
- 定义全局的KNI变量:struct rte_kni *。
 - KNI初始化:rte_kni_init(…)。
 - 完善struct rte_kni_conf,用于写入内核中。
 - 完善struct rte_kni_ops。
 - 实现一个config_network_if类型的函数,用于网络的up、down操作。
 - 分配KNI,保存到全局变量中:rte_kni_alloc(…)。
 - 把包发送到内核中:rte_kni_tx_burst(…)。
 - 特别注意,要打开混杂模式:rte_eth_promiscuous_enable(…)。
 - 这里演示了把包发送到内核,并没有从内核中抓取返回的包发送出去。
 
(dpdk_udp.c)
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_kni.h>
#include <stdio.h>
#include <arpa/inet.h>
// Compile-time switches: ENABLE_SEND guards the UDP sender/echo code,
// ENABLE_KNI guards the KNI setup code (see the matching #if blocks).
#define ENABLE_SEND	1
#define ENABLE_KNI	1
// Number of mbufs requested when the packet pool is created in main().
#define NUM_MBUFS (4096-1)
// Maximum number of packets fetched by one rte_eth_rx_burst() call.
#define BURST_SIZE	32
// Port id passed to every rte_eth_* call in this file.
int gDpdkPortId = 0;
// Base port configuration; only the maximum RX packet length is set.
static const struct rte_eth_conf port_conf_default = {
	.rxmode = {.max_rx_pkt_len = RTE_ETHER_MAX_LEN }
};
#if ENABLE_KNI
// KNI handle; assigned in main() from rte_kni_alloc().
struct rte_kni *global_kni = NULL;
#endif
#if ENABLE_SEND
// Addressing used for the next outgoing UDP frame. main() fills these from
// the most recently received UDP packet with source and destination swapped;
// IPs and ports are copied straight from the headers, so they stay in
// network byte order.
static uint32_t gSrcIp;
static uint32_t gDstIp;
static uint16_t gSrcPort;
// 16 bits like gSrcPort: it is filled with a 2-byte copy and assigned to a
// 16-bit header field, which a 32-bit variable only survives on
// little-endian hosts.
static uint16_t gDstPort;
static uint8_t gSrcMac[RTE_ETHER_ADDR_LEN];
static uint8_t gDstMac[RTE_ETHER_ADDR_LEN];
#endif
//
/*
 * Configure and start the Ethernet port gDpdkPortId with one RX queue
 * (and, when ENABLE_SEND is set, one TX queue), then enable promiscuous
 * mode on it.
 *
 * mbuf_pool: pool the RX queue takes its receive buffers from.
 *
 * Any setup failure terminates the process through rte_exit().
 */
static void ng_init_port(struct rte_mempool *mbuf_pool) {
	// At least one usable port must be present.
	uint16_t nb_sys_ports = rte_eth_dev_count_avail();
	if (nb_sys_ports == 0) {
		rte_exit(EXIT_FAILURE, "No Supported eth found\n");
	}

	// Device info supplies the default TX queue configuration used below.
	struct rte_eth_dev_info dev_info;
	rte_eth_dev_info_get(gDpdkPortId, &dev_info);

	const int num_rx_queues = 1;
	const int num_tx_queues = 1;
	struct rte_eth_conf port_conf = port_conf_default;
	// A failed configure would make every later queue call fail as well,
	// so stop here with a message that names the real cause.
	if (rte_eth_dev_configure(gDpdkPortId, num_rx_queues, num_tx_queues,
		&port_conf) < 0) {
		rte_exit(EXIT_FAILURE, "Could not configure port\n");
	}

	// RX queue 0 with 1024 descriptors.
	if (rte_eth_rx_queue_setup(gDpdkPortId, 0 , 1024,
		rte_eth_dev_socket_id(gDpdkPortId),NULL, mbuf_pool) < 0) {
		rte_exit(EXIT_FAILURE, "Could not setup RX queue\n");
	}

#if ENABLE_SEND
	// TX queue 0 with 1024 descriptors. Its offloads come from the port's
	// TX mode, not from the RX mode.
	struct rte_eth_txconf txq_conf = dev_info.default_txconf;
	txq_conf.offloads = port_conf.txmode.offloads;
	if (rte_eth_tx_queue_setup(gDpdkPortId, 0 , 1024,
		rte_eth_dev_socket_id(gDpdkPortId), &txq_conf) < 0) {
		rte_exit(EXIT_FAILURE, "Could not setup TX queue\n");
	}
#endif

	if (rte_eth_dev_start(gDpdkPortId) < 0 ) {
		rte_exit(EXIT_FAILURE, "Could not start\n");
	}

	// Promiscuous mode: accept frames regardless of destination MAC.
	rte_eth_promiscuous_enable(gDpdkPortId);
}
#if ENABLE_SEND
static int ng_encode_udp_pkt(uint8_t *msg, unsigned char *data, uint16_t 
total_len) {
	// encode 
	// 1 ethhdr
	struct rte_ether_hdr *eth = (struct rte_ether_hdr *)msg;
	rte_memcpy(eth->s_addr.addr_bytes, gSrcMac, RTE_ETHER_ADDR_LEN);
	rte_memcpy(eth->d_addr.addr_bytes, gDstMac, RTE_ETHER_ADDR_LEN);
	eth->ether_type = htons(RTE_ETHER_TYPE_IPV4);
	
	// 2 iphdr 
	struct rte_ipv4_hdr *ip = (struct rte_ipv4_hdr *)(msg + sizeof(struct 
rte_ether_hdr));
	ip->version_ihl = 0x45;
	ip->type_of_service = 0;
	ip->total_length = htons(total_len - sizeof(struct rte_ether_hdr));
	ip->packet_id = 0;
	ip->fragment_offset = 0;
	ip->time_to_live = 64; // ttl = 64
	ip->next_proto_id = IPPROTO_UDP;
	ip->src_addr = gSrcIp;
	ip->dst_addr = gDstIp;
	
	ip->hdr_checksum = 0;
	ip->hdr_checksum = rte_ipv4_cksum(ip);
	// 3 udphdr 
	struct rte_udp_hdr *udp = (struct rte_udp_hdr *)(msg + sizeof(struct 
rte_ether_hdr) + sizeof(struct rte_ipv4_hdr));
	udp->src_port = gSrcPort;
	udp->dst_port = gDstPort;
	uint16_t udplen = total_len - sizeof(struct rte_ether_hdr) - sizeof(struct 
rte_ipv4_hdr);
	udp->dgram_len = htons(udplen);
	rte_memcpy((uint8_t*)(udp+1), data, udplen);
	udp->dgram_cksum = 0;
	udp->dgram_cksum = rte_ipv4_udptcp_cksum(ip, udp);
	struct in_addr addr;
	addr.s_addr = gSrcIp;
	printf(" --> src: %s:%d, ", inet_ntoa(addr), ntohs(gSrcPort));
	addr.s_addr = gDstIp;
	printf("dst: %s:%d\n", inet_ntoa(addr), ntohs(gDstPort));
	return 0;
}
static struct rte_mbuf * ng_send(struct rte_mempool *mbuf_pool, uint8_t *data
, uint16_t length) {
	// mempool --> mbuf
	const unsigned total_len = length + 42;
	struct rte_mbuf *mbuf = rte_pktmbuf_alloc(mbuf_pool);
	if (!mbuf) {
		rte_exit(EXIT_FAILURE, "rte_pktmbuf_alloc\n");
	}
	mbuf->pkt_len = total_len;
	mbuf->data_len = total_len;
	uint8_t *pktdata = rte_pktmbuf_mtod(mbuf, uint8_t*);
	ng_encode_udp_pkt(pktdata, data, total_len);
	return mbuf;
}
#endif
#if ENABLE_KNI
/*
 * Interface up/down callback registered in rte_kni_ops.config_network_if.
 *
 * port_id: Ethernet port the request applies to.
 * if_up:   non-zero to bring the port up, zero to bring it down.
 *
 * Returns 0 on success, -EINVAL for an invalid port, or the negative
 * value returned by rte_eth_dev_start() when the port cannot be started.
 */
static int gconfig_network_if(uint16_t port_id, uint8_t if_up) {
	if (!rte_eth_dev_is_valid_port(port_id)) {
		return -EINVAL;
	}

	int ret = 0;
	// Both directions begin by stopping the port; "up" then restarts it.
	rte_eth_dev_stop(port_id);
	if (if_up) {
		ret = rte_eth_dev_start(port_id);
	}

	if (ret < 0) {
		printf("Failed to start port : %d\n", port_id);
	}

	// Propagate the failure instead of always reporting success.
	return ret;
}
#endif
int main(int argc, char *argv[]) {
	if (rte_eal_init(argc, argv) < 0) {
		rte_exit(EXIT_FAILURE, "Error with EAL init\n");
		
	}
	struct rte_mempool *mbuf_pool = rte_pktmbuf_pool_create("mbuf pool", 
NUM_MBUFS,
		0, 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());
	if (mbuf_pool == NULL) {
		rte_exit(EXIT_FAILURE, "Could not create mbuf pool\n");
	}
#if ENABLE_KNI 
	rte_kni_init(gDpdkPortId);
#endif
	ng_init_port(mbuf_pool);
	
#if ENABLE_KNI
	struct rte_kni_conf conf;
	memset(&conf, 0, sizeof(conf));
	snprintf(conf.name, RTE_KNI_NAMESIZE, "vEth%d", gDpdkPortId);
	conf.group_id = gDpdkPortId;
	conf.mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE;
	//conf.
	rte_eth_macaddr_get(gDpdkPortId, (struct rte_ether_addr*)conf.mac_addr);
	rte_eth_dev_get_mtu(gDpdkPortId, &conf.mtu);
	struct rte_kni_ops ops;
	memset(&ops, 0, sizeof(ops));
	ops.port_id = gDpdkPortId;
	ops.config_network_if = gconfig_network_if;
	global_kni = rte_kni_alloc(mbuf_pool, &conf, &ops);
	
#endif
	while (1) {
		struct rte_mbuf *mbufs[BURST_SIZE];
		unsigned num_recvd = rte_eth_rx_burst(gDpdkPortId, 0, mbufs, BURST_SIZE);
		if (num_recvd > BURST_SIZE) {
			rte_exit(EXIT_FAILURE, "Error receiving from eth\n");
		}
		unsigned i = 0;
		for (i = 0;i < num_recvd;i ++) {
			struct rte_ether_hdr *ehdr = rte_pktmbuf_mtod(mbufs[i], struct 
rte_ether_hdr*);
			if (ehdr->ether_type != rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4)) {
				continue;
			}
			struct rte_ipv4_hdr *iphdr =  rte_pktmbuf_mtod_offset(mbufs[i], struct 
rte_ipv4_hdr *, 
				sizeof(struct rte_ether_hdr));
			
			if (iphdr->next_proto_id == IPPROTO_UDP) {
				struct rte_udp_hdr *udphdr = (struct rte_udp_hdr *)(iphdr + 1);
#if ENABLE_SEND		// echo
				// mac exchange
				rte_memcpy(gDstMac, ehdr->s_addr.addr_bytes, RTE_ETHER_ADDR_LEN);
				rte_memcpy(gSrcMac, ehdr->d_addr.addr_bytes, RTE_ETHER_ADDR_LEN);
				// ip exchange
				rte_memcpy(&gSrcIp, &iphdr->dst_addr, sizeof(uint32_t));
				rte_memcpy(&gDstIp, &iphdr->src_addr, sizeof(uint32_t));
				// port exchange
				rte_memcpy(&gSrcPort, &udphdr->dst_port, sizeof(uint16_t));
				rte_memcpy(&gDstPort, &udphdr->src_port, sizeof(uint16_t));
				
#endif
				uint16_t length = ntohs(udphdr->dgram_len);
				*((char*)udphdr + length) = '\0';
				struct in_addr addr;
				addr.s_addr = iphdr->src_addr;
				printf("src: %s:%d, ", inet_ntoa(addr), udphdr->src_port);
				addr.s_addr = iphdr->dst_addr;
				printf("dst: %s:%d, %s\n", inet_ntoa(addr), udphdr->src_port, 
					(char *)(udphdr+1));
#if ENABLE_SEND
				struct rte_mbuf *txbuf = ng_send(mbuf_pool, (unsigned char*)(udphdr+1), 
length);
				rte_eth_tx_burst(gDpdkPortId, 0, &txbuf, 1);
#endif
				rte_pktmbuf_free(mbufs[i]);
			} else {
				rte_kni_tx_burst(global_kni, &mbufs[i], 1);
			}
			
		}
	}
}
 
Makefile:
# binary name
APP = dpdk_udp

# all source are stored in SRCS-y
SRCS-y := dpdk_udp.c

# Build using pkg-config variables if possible
ifeq ($(shell pkg-config --exists libdpdk && echo 0),0)

all: shared
.PHONY: shared static
shared: build/$(APP)-shared
	ln -sf $(APP)-shared build/$(APP)
static: build/$(APP)-static
	ln -sf $(APP)-static build/$(APP)

PKGCONF=pkg-config --define-prefix

PC_FILE := $(shell $(PKGCONF) --path libdpdk)
CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
LDFLAGS_SHARED = $(shell $(PKGCONF) --libs libdpdk)
LDFLAGS_STATIC = -Wl,-Bstatic $(shell $(PKGCONF) --static --libs libdpdk)

# Recipe lines below must start with a TAB character, not spaces.
build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)

build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

build:
	@mkdir -p $@

.PHONY: clean
clean:
	rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared
	test -d build && rmdir -p build || true

else # Build using the legacy make-based build system

ifeq ($(RTE_SDK),)
$(error "Please define RTE_SDK environment variable")
endif

# Default target, detect a build directory, by looking for a path with a .config
RTE_TARGET ?= $(notdir $(abspath $(dir $(firstword $(wildcard $(RTE_SDK)/*/.config)))))

include $(RTE_SDK)/mk/rte.vars.mk

CFLAGS += -O3

# rte.extapp.mk provides the rules that build $(APP) from $(SRCS-y).
include $(RTE_SDK)/mk/rte.extapp.mk

endif
 
总结
调试时,需要把/sys/devices/virtual/net/vEth0/carrier置为1。允许内核收发数据。
echo 1 > /sys/devices/virtual/net/vEth0/carrier
 




















