import nltk

from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
def text_preprocessing(text):
    """Run a five-step NLTK preprocessing demo on *text* and print each stage.

    Steps: sentence splitting, word tokenization, POS tagging (first 10
    tokens), stopword/punctuation filtering, and lemmatization.

    Args:
        text: The raw input; must be a non-empty ``str``.

    Returns:
        None. All results are printed to stdout.

    Raises:
        Nothing escapes: a ``ValueError`` is raised internally for empty or
        non-string input, but every exception is caught at the function
        boundary and reported via ``print`` (tutorial-style error handling).
    """
    try:
        # Guard clause: reject empty or non-string input up front.
        if not text or not isinstance(text, str):
            raise ValueError("输入文本不能为空且必须为字符串格式。")

        print("===== 1. 分句结果 =====")
        sentences = sent_tokenize(text)
        for i, sent in enumerate(sentences):
            print(f"句子{i+1}: {sent}")

        print("\n===== 2. 分词结果 =====")
        all_tokens = []
        for sent in sentences:
            tokens = word_tokenize(sent)
            all_tokens.extend(tokens)
            print(f"「{sent}」-> {tokens}")

        print("\n===== 3. 词性标注结果 (前10个) =====")
        tagged_tokens = pos_tag(all_tokens)
        for word, tag in tagged_tokens[:10]:
            print(f"{word} -> {tag}")

        print("\n===== 4. 停用词过滤结果 =====")
        stop_words = set(stopwords.words('english'))
        # Drop punctuation (isalpha) and stopwords; lowercase first because
        # the NLTK stopword list is all lowercase.
        filtered_tokens = [
            w.lower()
            for w in all_tokens
            if w.isalpha() and w.lower() not in stop_words
        ]
        print(f"过滤前词数: {len(all_tokens)}, 过滤后: {len(filtered_tokens)}")
        print(filtered_tokens)

        print("\n===== 5. 词形还原结果 (拓展) =====")
        lemmatizer = WordNetLemmatizer()
        # NOTE: lemmatize() defaults to the noun POS; this is the basic usage
        # shown in the tutorial, not POS-aware lemmatization.
        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in filtered_tokens]
        print(lemmatized_tokens)
    except Exception as e:
        # Top-level demo boundary: report instead of crashing the script.
        print(f"代码运行出错: {e}")
raw_text = """Natural language processing (NLP) is a subfield of artificial intelligence. It focuses on enabling computers to understand and process human language. NLTK is a popular library for NLP in Python, which provides many useful tools."""
if __name__ == "__main__": text_preprocessing(raw_text)
逐块精析
模块导入
1 2 3 4 5
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize  # 导入分句和分词工具
from nltk.corpus import stopwords  # 导入停用词词库
from nltk.stem import WordNetLemmatizer  # 导入词形还原工具
from nltk import pos_tag  # 导入词性标注工具
print("\n===== 3. 词性标注结果 (前10个) =====") tagged_tokens = pos_tag(all_tokens) for word, tag in tagged_tokens[:10]: print(f"{word} -> {tag}")
pos_tag: 对 all_tokens 中的每个单词分配一个词性标签。
常见标签: JJ (形容词), NN (名词), VBZ (动词单三), IN (介词)。
步骤 4:停用词过滤 (Stopwords Removal)
1 2 3 4 5 6 7
print("\n===== 4. 停用词过滤结果 =====") stop_words = set(stopwords.words('english')) filtered_tokens = [w.lower() for w in all_tokens if w.isalpha() and w.lower() notin stop_words] print(f"过滤前词数: {len(all_tokens)}, 过滤后: {len(filtered_tokens)}") print(filtered_tokens)
w.lower(): 统一转为小写,因为停用词表全是小写。
w.isalpha(): 检查字符是否全是字母。它会自动过滤掉 ( ) . , 等标点符号。
not in stop_words: 剔除掉无意义的常用词。
步骤 5:词形还原 (Lemmatization)
1 2 3 4 5
print("\n===== 5. 词形还原结果 (拓展) =====") lemmatizer = WordNetLemmatizer() lemmatized_tokens = [lemmatizer.lemmatize(w) for w in filtered_tokens] print(lemmatized_tokens)