数据结构 ——前缀树查词典的实现

news2026/2/2 16:07:52

数据结构 ——前缀树查词典的实现

一、前缀树的概念

前缀树是一种多叉树结构，主要用于存储字符串。每个节点代表一个字符，路径从根节点到叶节点表示一个完整的字符串。前缀树的关键特征是共享前缀，也就是说，如果两个字符串有相同的前缀，它们会共享前缀部分的节点。
优点：由于共享前缀，前缀树能有效地节省空间，并且支持快速查找、插入、删除操作，尤其是在处理大量字符串时非常高效。
应用场景：主要用于实现高效的字符串查找、自动补全、词典查询等。

二、查词典的代码实现

在这里插入图片描述
插入key:ant过程，查找单词同插入差不多

在这里插入图片描述
插入key:donkey

下面是利用前缀树在一个给定的文件log.txt,来实现一个简单的查词典功能

//log,txt
ant:a samll insect that lives in group
butterfly:a flying insect with a long thin body
cobra:a highly dangerous snake
donkey: a animal with short legs and long ears

//利用前缀数，根据提供的文件实现一个简单的查词典功能
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DESC_SIZE  256
#define KEY_SIZE   256
#define BUFSIZE    512
#define FNAME      "log.txt"
struct node_st
{
    struct node_st *ch[26];//存放26个字母
    char desc[DESC_SIZE];//单词的描述
};
//根据：拆分关键字和描述  donkey: a animal with short legs and long ears
int get_word(FILE *fp, char *key, char *desc)
{
    char buf[BUFSIZE];//用于存放读出来的一行的字符
    char *retp;
    int i,j;
    retp=fgets(buf,BUFSIZE,fp);//从fp文件，读取BUFSIZE个字符，存到buf中 
    //文件读取失败
    if (retp==NULL)
       return -1; 
    //把关键字和描述分开  KEY_SIZE-1是预留一个尾0
    for(i=0;i<KEY_SIZE-1 && buf[i]!=':';i++) 
        key[i]=buf[i];
    key[i]='\0';
    i++;
    for(j=0;j<DESC_SIZE-1 && buf[i]!='\0';j++,i++)
        desc[j]=buf[i];
    desc[j]='\0';
    return 0;  
}
struct node_st *newnode()
{
    struct node_st *node;
    int i;
    node=malloc(sizeof(*node));//申请的是 struct node_st的内存,指针内存是struct node_st *
    if(node==NULL)
        return NULL;
    //初始化一个指针数组ch和描述字符串desc
    node->desc[0]='\0';//字符串是以空字符结尾的字符数组,将第一个字符设置为 '\0' 实际创建了一个空字符串
    for(i=0;i<26;i++)
        node->ch[i]=NULL;
    return node;
}
int insert(struct node_st **root, char *key, char *desc)
{
    if(*root==NULL)
    {
        *root=newnode();
        if(*root==NULL)
            return -1;
    }
    if(*key=='\0')
    {
        strcpy((*root)->desc,desc);
        return 0;
    }
   /*注意(*root)->ch+*key-'a'和root->ch[*key-'a']的区别
   struct node_st *ch[26]定义一个指针数组，ch指向的是一个结构体数组，数组的每一个元素存的是指针值 ch[1]=*(ch+1) 表示的是这个数组首地址偏移一个位置的解引用 *&，访问的是这个偏移一个位置后的数据，且该数据存的是一个指针，为一级指针
   ch+1 则是一个二级指针，表示指向的是这个指针数组首地址偏移一个位置后的地址
   */
    return insert((*root)->ch+*key-'a',key+1,desc);//

}
char *find(struct node_st *root, char *key)
{
    if(root==NULL)
        return NULL;
    if(*key=='\0')
        return root->desc;
    return find(root->ch[*key-'a'],key+1);
    
}
int main() 
{
    FILE *fp;
    struct node_st *tree=NULL;
    char desc[DESC_SIZE]={'\0'}, key[KEY_SIZE]={'\0'};
    int ret;
    char *datap;
    fp=fopen(FNAME,"r");
    if(fp==NULL)
    {
        fprintf(stderr,"fopen():error\n");
        return -1;
    }
    while (1)
    {
        //拆单词
        ret=get_word(fp,key,desc);
        if(ret==-1)
            break;
        //测试key和desc是否分开
        // puts(key);
        // puts(desc);
        insert(&tree,key,desc);
    }
    // datap=find(tree,"donkey");
    datap=find(tree,"hello");
    if(datap==NULL)
        printf("Can not found!\n");
    else
        puts(datap);
    fclose(fp);
    return 0;
}