文本数据词汇级增强

news 2025/5/17 9:45:53

import random
import re

import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize



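# One-time setup: uncomment to download the NLTK data used below
# (WordNet for synonym lookup, Punkt for word_tokenize). Newer NLTK
# releases may additionally require nltk.download('punkt_tab').
# nltk.download('wordnet')
# nltk.download('punkt')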



def get_synonyms(word):
    """Return the WordNet synonyms of ``word``.

    Collects the lemma names of every synset WordNet returns for ``word``,
    turning the underscores used in multi-word lemmas into spaces.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of unique synonym strings that differ from ``word``
        (ignoring case); empty when WordNet yields none.
    """
    target = word.replace('_', ' ').lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace('_', ' ')
            # Compare case-insensitively so a lemma that differs from the
            # input only by capitalisation is not offered as its own synonym.
            if candidate.lower() != target:
                synonyms.add(candidate)
    # Sorted so that a seeded random.choice over the result is reproducible;
    # set iteration order changes with string hash randomization.
    return sorted(synonyms)



def synonym_replacement(text, n=1):
    """Replace ``n`` randomly chosen words of ``text`` with synonyms.

    Args:
        text: The input text.
        n: Number of alphabetic words to sample for replacement.

    Returns:
        The text with every whole-word occurrence of each sampled word
        replaced by a randomly chosen synonym. Sampled words without any
        synonym are left untouched. ``text`` is returned unchanged when ``n``
        is not positive or exceeds the number of alphabetic words.
    """
    # Only purely alphabetic tokens are candidates for replacement.
    words = [word for word in word_tokenize(text) if word.isalpha()]
    if n <= 0 or len(words) < n:
        return text

    # Randomly pick the n words to replace.
    for random_word in random.sample(words, n):
        synonyms = get_synonyms(random_word)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        # Match whole words only: a plain str.replace would also rewrite the
        # word where it appears inside a longer one ("data" in "datasets").
        # Hyphens count as word characters so compounds stay intact.
        pattern = r'(?<![\w-])' + re.escape(random_word) + r'(?![\w-])'
        # A callable replacement inserts the synonym literally, without
        # interpreting backslashes or group references in it.
        text = re.sub(pattern, lambda _match, repl=synonym: repl, text)

    return text



def random_insertion(text, n=1):
    """Insert up to ``n`` synonyms at random positions in ``text``.

    Each round picks a random alphabetic word of the original text, looks up
    its synonyms and inserts one of them at a random token position. A round
    whose word has no synonyms inserts nothing.

    Args:
        text: The input text.
        n: Number of insertion rounds to attempt.

    Returns:
        The tokens of ``text`` (punctuation and other non-alphabetic tokens
        included) joined by single spaces with the synonyms inserted, or
        ``text`` unchanged when nothing was inserted.
    """
    tokens = word_tokenize(text)
    # Synonym sources are the alphabetic tokens; every token is kept in the
    # output so punctuation, numbers and hyphenated words are not lost.
    candidates = [token for token in tokens if token.isalpha()]
    if not candidates:
        return text

    inserted = False
    for _ in range(n):
        # Pick a random word and fetch its synonyms.
        synonyms = get_synonyms(random.choice(candidates))
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        # randint is inclusive at both ends, so len(tokens) appends at the end.
        insertion_idx = random.randint(0, len(tokens))
        tokens.insert(insertion_idx, synonym)
        inserted = True

    return " ".join(tokens) if inserted else text




def random_swap(text, n=1):
    """Swap ``n`` randomly chosen pairs of words in ``text``.

    Only alphabetic tokens change places; punctuation and other non-alphabetic
    tokens stay at their positions instead of being dropped from the output.

    Args:
        text: The input text.
        n: Number of swaps to perform.

    Returns:
        The tokens of ``text`` joined by single spaces after the swaps, or
        ``text`` unchanged when ``n`` is not positive or fewer than two
        alphabetic words are available.
    """
    tokens = word_tokenize(text)
    word_positions = [
        idx for idx, token in enumerate(tokens) if token.isalpha()
    ]
    if n <= 0 or len(word_positions) < 2:
        return text

    for _ in range(n):
        # Pick two distinct word positions and exchange their tokens.
        idx1, idx2 = random.sample(word_positions, 2)
        tokens[idx1], tokens[idx2] = tokens[idx2], tokens[idx1]

    return " ".join(tokens)




def random_deletion(text, p=0.1):
    """Drop each token of ``text`` independently with probability ``p``.

    Args:
        text: The input text.
        p: Threshold a token's random draw must exceed for it to be kept.

    Returns:
        The surviving tokens joined by single spaces. If every token was
        dropped, one randomly chosen token is returned instead. ``text`` is
        returned as-is when tokenization yields nothing.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return text

    kept = []
    for token in tokens:
        # One draw per token; the token survives when the draw exceeds p.
        if random.uniform(0, 1) > p:
            kept.append(token)

    if kept:
        return " ".join(kept)

    # Everything was dropped: keep at least one token.
    return random.choice(tokens)



def aug(text, method=()):
    """Apply a sequence of augmentation steps to ``text``.

    Args:
        text: The input text.
        method: Iterable of step names, applied in order:
            ``'re'`` - synonym replacement on a third of the space-separated
            words; ``'in'`` - random insertion for a quarter of them;
            ``"swap"`` - two random swaps; ``"delet"`` - random deletion with
            ``p=0.2``. Unrecognised names are skipped.

    Returns:
        The text after all requested steps have been applied.
    """
    for m in method:
        if m == 're':
            # Compute the count before augmenting so the debug print reports
            # the n that was actually used, not one derived from the result.
            n = len(text.split(" ")) // 3
            text = synonym_replacement(text, n=n)
            print("re:", n)
        elif m == 'in':
            n = len(text.split(" ")) // 4
            text = random_insertion(text, n=n)
            print("in:", n)
        elif m == "swap":
            text = random_swap(text, n=2)
        elif m == "delet":
            text = random_deletion(text, p=0.2)
    return text



if __name__ == '__main__':
    # Demo: run replacement -> insertion -> swap on one sample sentence.
    text = (
        "Advanced data analytics enables organizations to extract actionable "
        "insights from large datasets, leveraging machine learning algorithms "
        "and statistical models to optimize decision-making, enhance "
        "operational efficiency, and predict market trends with high accuracy, "
        "ultimately driving competitive advantage through data-driven "
        "strategies in industries such as finance, healthcare, and e-commerce."
    )
    # Number of space-separated words in the sample.
    print(len(text.split(" ")))

    # The individual steps can also be chained by hand:
    # augmented_text = synonym_replacement(text, n=20)
    # inserted_text = random_insertion(augmented_text, n=10)
    # swap_text = random_swap(inserted_text, n=2)
    # delet_text = random_deletion(swap_text, p=0.2)
    final_text = aug(text, ["re", "in", "swap"])

    print(f"原始文本: {text}")
    print(f"增强后文本: {final_text}")

使用nlpaug库

import nlpaug.augmenter.word as naw

# Synonym-replacement augmenter backed by WordNet.
# NOTE(review): aug_p=0.3 is presumably the share of words to augment -
# confirm against the nlpaug documentation.
aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.3)

text = "Advanced data analytics enables organizations to extract actionable insights from large datasets, leveraging machine learning algorithms and statistical models to optimize decision-making, enhance operational efficiency, and predict market trends with high accuracy, ultimately driving competitive advantage through data-driven strategies in industries such as finance, healthcare, and e-commerce."
# Recent nlpaug releases return a list of strings from augment(), even for a
# single input string; unwrap it so the comparison below is str against str
# rather than str against list (which is never equal).
augmented = aug.augment(text)
if isinstance(augmented, list) and augmented:
    augmented_text = augmented[0]
else:
    augmented_text = augmented
# Prints 1 when the augmenter left the text unchanged.
if text == augmented_text:
    print(1)
print(f"原始文本: {text}")
print(f"增强后文本: {augmented_text}")

# 使用词向量进行同义词替换
# aug = naw.WordEmbsAug(
#     model_type='word2vec', model_path='path/to/word2vec.bin',
#     action="substitute")

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.coloradmin.cn/o/2377545.html

如若内容造成侵权/违法违规/事实不符，请联系多彩编程网进行投诉反馈，一经查实，立即删除！