import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import random
import re  # used for whole-word synonym replacement

# Uncomment on first run to download the required NLTK data:
# nltk.download('wordnet')
# nltk.download('punkt')
def get_synonyms(word):
    """Return a deduplicated list of WordNet synonyms for a word."""
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            if lemma.name() != word:
                synonyms.append(lemma.name().replace('_', ' '))
    return list(set(synonyms))
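
# Quick sanity check (a sketch; the exact output depends on the installed
# WordNet data, and "happy" is just an illustrative input):
# print(get_synonyms("happy"))  # e.g. ['glad', 'felicitous', 'well-chosen', ...]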
def synonym_replacement(text, n=1):
    """Replace n randomly chosen words in the text with a synonym."""
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha()]  # keep alphabetic tokens only
    if len(words) < n:
        return text
    # Randomly pick n words to replace
    random_words = random.sample(words, n)
    for random_word in random_words:
        synonyms = get_synonyms(random_word)
        if synonyms:
            synonym = random.choice(synonyms)
            # Replace whole words only; a plain str.replace would also hit
            # substrings (e.g. "cat" inside "category") and every occurrence
            text = re.sub(r'\b' + re.escape(random_word) + r'\b', synonym, text, count=1)
    return text
def random_insertion(text, n=1):
    """Insert synonyms of n randomly chosen words at random positions."""
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha()]
    augmented_text = text
    for _ in range(n):
        if not words:
            break
        # Pick a random word and look up its synonyms
        random_word = random.choice(words)
        synonyms = get_synonyms(random_word)
        if synonyms:
            synonym = random.choice(synonyms)
            # Insert the synonym at a random position
            insertion_idx = random.randint(0, len(words))
            words.insert(insertion_idx, synonym)
            augmented_text = " ".join(words)
    return augmented_text
def random_swap(text, n=1):
    """Swap the positions of n randomly chosen pairs of words."""
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha()]
    augmented_words = words.copy()
    for _ in range(n):
        if len(augmented_words) < 2:
            break
        # Pick two distinct positions and swap them
        idx1, idx2 = random.sample(range(len(augmented_words)), 2)
        augmented_words[idx1], augmented_words[idx2] = augmented_words[idx2], augmented_words[idx1]
    return " ".join(augmented_words)
def random_deletion(text, p=0.1):
    """Delete each word in the text independently with probability p."""
    words = word_tokenize(text)
    if not words:
        return text
    # Keep each word with probability 1 - p
    remaining_words = [word for word in words if random.uniform(0, 1) > p]
    if not remaining_words:
        # Keep at least one word
        return random.choice(words)
    return " ".join(remaining_words)
def aug(text, methods=None):
    """Apply the selected augmentation methods to the text, in order."""
    if methods is None:  # avoid a mutable default argument
        methods = []
    for m in methods:
        if m == 're':
            # Replace roughly a third of the words; compute n before the
            # text is modified so the printed count matches the one used
            n = len(text.split(" ")) // 3
            text = synonym_replacement(text, n=n)
            print("re:", n)
        elif m == 'in':
            # Insert synonyms for roughly a quarter of the words
            n = len(text.split(" ")) // 4
            text = random_insertion(text, n=n)
            print("in:", n)
        elif m == 'swap':
            text = random_swap(text, n=2)
        elif m == 'delete':
            text = random_deletion(text, p=0.2)
    return text
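
# Example (hypothetical input text): chaining three of the four operations
# aug("data augmentation is a cheap way to grow a training set",
#     methods=["re", "swap", "delete"])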
if __name__ == '__main__':
    text = "Advanced data analytics enables organizations to extract actionable insights from large datasets, leveraging machine learning algorithms and statistical models to optimize decision-making, enhance operational efficiency, and predict market trends with high accuracy, ultimately driving competitive advantage through data-driven strategies in industries such as finance, healthcare, and e-commerce."
    print(len(text.split(" ")))
    # The operations can also be applied one at a time:
    # augmented_text = synonym_replacement(text, n=20)
    # inserted_text = random_insertion(augmented_text, n=10)
    # swap_text = random_swap(inserted_text, n=2)
    # deleted_text = random_deletion(swap_text, p=0.2)
    final_text = aug(text, ["re", "in", "swap"])
    print(f"Original text: {text}")
    print(f"Augmented text: {final_text}")
Using the nlpaug library
import nlpaug.augmenter.word as naw

# Synonym-replacement augmenter (WordNet-based); aug_p sets the share of words to replace
synonym_aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.3)
text = "Advanced data analytics enables organizations to extract actionable insights from large datasets, leveraging machine learning algorithms and statistical models to optimize decision-making, enhance operational efficiency, and predict market trends with high accuracy, ultimately driving competitive advantage through data-driven strategies in industries such as finance, healthcare, and e-commerce."
augmented_text = synonym_aug.augment(text)
# augment() returns a list of strings in recent nlpaug versions; unwrap it
if isinstance(augmented_text, list):
    augmented_text = augmented_text[0]
if text == augmented_text:
    print("Augmentation left the text unchanged")
print(f"Original text: {text}")
print(f"Augmented text: {augmented_text}")
# Synonym replacement driven by word embeddings instead of WordNet
# aug = naw.WordEmbsAug(
#     model_type='word2vec', model_path='path/to/word2vec.bin',
#     action="substitute")