nltk lib use
Published: 2024-11-21 | Words: 1.2k | Reading time: 5 min


Introduction

NLTK (Natural Language Toolkit) is a Python library for processing and analyzing human language data (natural language). It provides a wide range of tools and resources for natural language processing (NLP) and is widely used in academic research, machine learning, text analytics, and related fields.

Installation

pip install nltk

Downloading resources

import nltk
nltk.download('punkt')      # tokenizer models used for word/sentence tokenization
nltk.download('stopwords')  # common stopword lists

Features

  1. Text processing and tokenization

    • Splits text into smaller units such as words and sentences, for example breaking a sentence into a list of words.
      import nltk
      from nltk.tokenize import word_tokenize
      text = "Hello, world!"
      words = word_tokenize(text)
      print(words) # Output: ['Hello', ',', 'world', '!']
  2. Part-of-speech tagging (POS tagging)

    • Assigns each word a part-of-speech tag (noun, verb, adjective, etc.), which helps clarify the grammatical role the word plays in the sentence.
      from nltk.tokenize import word_tokenize
      from nltk import pos_tag  # requires the 'averaged_perceptron_tagger' resource
      sentence = "I am learning Python."
      words = word_tokenize(sentence)
      tagged = pos_tag(words)
      print(tagged)
      # Output: [('I', 'PRP'), ('am', 'VBP'), ('learning', 'VBG'), ('Python', 'NNP'), ('.', '.')]
  3. Named entity recognition (NER)

    • NLTK can recognize named entities in text, such as person names, place names, and organization names.
      from nltk import ne_chunk  # requires the 'maxent_ne_chunker' and 'words' resources
      from nltk.tokenize import word_tokenize
      from nltk import pos_tag

      sentence = "Barack Obama was born in Hawaii."
      words = word_tokenize(sentence)
      tagged = pos_tag(words)
      named_entities = ne_chunk(tagged)
      print(named_entities)
      # Output: (S (PERSON Barack/NNP Obama/NNP) was/VBD born/VBN in/IN Hawaii/NNP ./.)
  4. Lexical resources and corpora

    • NLTK ships with many built-in corpora containing large amounts of text data that can be used for language-model training, text analysis, and so on. For example, the punkt resource drives sentence splitting (see the sketch after the snippet below), and the stopwords corpus provides common stop words (such as "the" and "is") that can be ignored during text processing.
      from nltk.corpus import stopwords
      stop_words = set(stopwords.words('english'))
      print(stop_words) # Output: {'the', 'and', 'is', 'in', 'of', 'to', ...}
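    • Since the punkt resource is only mentioned above, here is a minimal sentence-splitting sketch with sent_tokenize, which relies on it (the example text is invented for illustration):
      from nltk.tokenize import sent_tokenize
      text = "NLTK is a Python library. It ships with many corpora and models."
      sentences = sent_tokenize(text)  # uses the punkt sentence tokenizer under the hood
      print(sentences)
      # Expected output: ['NLTK is a Python library.', 'It ships with many corpora and models.']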
  5. Stemming and lemmatization

    • Both techniques reduce words to a base form (e.g., turning "running" into "run").
    • Stemming strips word endings with simple rules, while lemmatization takes context into account and uses a dictionary plus rules; a lemmatization sketch follows the stemming snippet below.
      from nltk.stem import PorterStemmer
      stemmer = PorterStemmer()
      print(stemmer.stem("running")) # Output: run
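    • The snippet above covers only stemming; for completeness, here is a minimal lemmatization sketch using NLTK's WordNetLemmatizer (it needs the wordnet resource):
      import nltk
      from nltk.stem import WordNetLemmatizer
      nltk.download('wordnet')  # lexical database required by the lemmatizer
      lemmatizer = WordNetLemmatizer()
      print(lemmatizer.lemmatize("running", pos="v"))  # Output: run
      print(lemmatizer.lemmatize("mice"))              # Output: mouse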
  6. Text classification

    • NLTK supports building and training text classifiers, which is useful for tasks such as spam filtering and sentiment analysis; a toy sketch follows.
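    • A minimal sketch of training NLTK's NaiveBayesClassifier on hand-built bag-of-words features; the tiny labelled data set below is invented purely for illustration:
      from nltk import NaiveBayesClassifier, classify

      def word_features(text):
          # very simple bag-of-words features: each word maps to True
          return {word.lower(): True for word in text.split()}

      # toy labelled data, invented for illustration only
      train_data = [
          (word_features("free prize click now"), "spam"),
          (word_features("win money fast"), "spam"),
          (word_features("meeting scheduled for monday"), "ham"),
          (word_features("project report attached"), "ham"),
      ]
      classifier = NaiveBayesClassifier.train(train_data)
      print(classifier.classify(word_features("claim your free money")))  # likely: spam
      print(classify.accuracy(classifier, train_data))  # accuracy on the training data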
  7. Syntactic parsing

    • NLTK supports various parsing techniques, including context-free grammars (CFG) and dependency parsing, which help reveal sentence structure; a toy CFG example follows.
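    • A minimal context-free grammar sketch parsed with nltk.ChartParser; the toy grammar is invented and only covers this one sentence:
      import nltk

      # a tiny grammar that covers only the example sentence
      grammar = nltk.CFG.fromstring("""
          S -> NP VP
          NP -> Det N
          VP -> V NP
          Det -> 'the'
          N -> 'dog' | 'cat'
          V -> 'chased'
      """)
      parser = nltk.ChartParser(grammar)
      for tree in parser.parse("the dog chased the cat".split()):
          print(tree)
      # Output: (S (NP (Det the) (N dog)) (VP (V chased) (NP (Det the) (N cat))))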
  8. Machine translation and word embeddings

    • NLTK can also be combined with other libraries (such as Gensim) for tasks like word-embedding models and machine translation; a rough Word2Vec sketch follows.
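    • A rough sketch of pairing NLTK tokenization with Gensim's Word2Vec (parameter names follow the Gensim 4.x API); the tiny corpus is invented, so the learned vectors are only illustrative:
      from nltk.tokenize import word_tokenize
      from gensim.models import Word2Vec

      # toy corpus, tokenized with NLTK
      corpus = [
          "NLTK handles tokenization and tagging",
          "Gensim trains word embeddings",
          "word embeddings capture word similarity",
      ]
      sentences = [word_tokenize(doc.lower()) for doc in corpus]

      # train a small Word2Vec model (vector_size was called size before Gensim 4.0)
      model = Word2Vec(sentences, vector_size=50, window=3, min_count=1, epochs=50)
      print(model.wv["embeddings"])                       # learned vector for one token
      print(model.wv.most_similar("embeddings", topn=3))  # nearest neighbours in the toy space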

Demo

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy
from collections import Counter
from textblob import TextBlob
import re
import os

# project-specific helper that provides the local path to the spaCy English model
from src.txt_decode.txt_decode_new import en_model_path

# download the required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# load the spaCy English model
nlp = spacy.load(en_model_path)

def basic_text_features(script):
    # tokenize into words
    words = word_tokenize(script)
    # drop stopwords and non-alphanumeric tokens
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # word frequency counts
    word_freq = Counter(filtered_words)
    return word_freq

def syntax_features(script):
    doc = nlp(script)
    pos_tags = [(token.text, token.pos_) for token in doc]
    dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
    return pos_tags, dependencies

def tfidf_features(scripts):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(scripts)
    return X, vectorizer.get_feature_names_out()

def topic_modeling(scripts, num_topics=2):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(scripts)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
    lda.fit(X)
    return lda, vectorizer

def sentiment_analysis(script):
    blob = TextBlob(script)
    sentiment = blob.sentiment
    return sentiment.polarity, sentiment.subjectivity

def dialogue_features(script):
    # regex accepts both Chinese and English quotation marks
    dialogues = re.findall(r'["“”](.*?)[”"]', script)
    return dialogues

def read_script(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File {file_path} not found.")
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except PermissionError:
        raise PermissionError(f"Permission denied to read the file {file_path}. Please check file permissions.")

if __name__ == "__main__":
    file_path = r'D:\pythonProject\LE-SEO.doc'
    try:
        script = read_script(file_path)

        # basic text features
        word_freq = basic_text_features(script)
        print("Basic Text Features (Word Frequency):")
        print(word_freq)

        # syntactic features
        pos_tags, dependencies = syntax_features(script)
        print("\nSyntax Features (POS Tags and Dependencies):")
        print("POS Tags:", pos_tags)
        print("Dependencies:", dependencies)

        # semantic features (TF-IDF)
        scripts_list = [script]  # add more scripts here to analyse several at once
        X, feature_names = tfidf_features(scripts_list)
        print("\nTF-IDF Features:")
        print(X.toarray())
        print("Feature Names:", feature_names)

        # topic modeling (LDA)
        lda, vectorizer = topic_modeling(scripts_list)
        print("\nTopic Modeling (LDA):")
        # print the top words of each topic
        for topic_idx, topic in enumerate(lda.components_):
            print(f"Topic #{topic_idx}:")
            print([feature_names[i] for i in topic.argsort()[:-10 - 1:-1]])

        # sentiment analysis
        polarity, subjectivity = sentiment_analysis(script)
        print("\nSentiment Analysis:")
        print(f"Polarity: {polarity}, Subjectivity: {subjectivity}")

        # dialogue extraction
        dialogues = dialogue_features(script)
        print("\nDialogue Features:")
        print(dialogues)

    except Exception as e:
        print(f"An error occurred: {e}")
