2015-11-25 2 views
1

Я использую отличную библиотеку scikit-learn, применяя LDA/NMF к своему набору данных. Тематическое моделирование NMF/LDA в scikit-learn

from __future__ import print_function 
from time import time 

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.decomposition import NMF, LatentDirichletAllocation 
from sklearn.datasets import fetch_20newsgroups 

# Script-wide settings.
# NOTE(review): n_samples is only interpolated into the progress messages of
# this script; the corpus itself is never sliced to this size.
n_samples = 2000
# Passed as max_features to CountVectorizer (the LDA input). The same argument
# is commented out in the TfidfVectorizer call, so it does not cap the NMF input.
n_features = 1000
# Number of components requested from both the NMF and the LDA model.
n_topics = 5
# How many of the highest-weighted words are printed for each topic.
n_top_words = 5


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` a header line ``TopiC#<index>:`` is
    printed, followed by one line holding the ``n_top_words`` feature names
    with the largest weights, in descending weight order and separated by
    single spaces. One empty line is printed after all topics.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Indexable collection mapping a column index to the
            word it stands for.
        n_top_words: Number of words to show per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        best_columns = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[col] for col in best_columns]
        print("TopiC#%d:" % topic_idx)
        print(" ".join(top_words))
    # Single blank separator line once every topic has been listed.
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
# Start of the timed section; the elapsed seconds are reported after loading.
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
          remove=('headers', 'footers', 'quotes'))
# NOTE(review): every document in dataset.data is kept here; the list is not
# truncated to n_samples even though later messages report that number.
data_samples = dataset.data
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
# NOTE(review): max_features is commented out in this call, so the tf-idf
# vocabulary is limited only by max_df, min_df and the stop-word list, not by
# n_features.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
            stop_words='english')
t0 = time()
# Learn the vocabulary and build the tf-idf document-term matrix in one pass.
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
# Unlike the tf-idf vectorizer, this one does receive max_features=n_features.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
           stop_words='english')
t0 = time()
# Learn the vocabulary and build the raw-count document-term matrix.
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
# NOTE(review): the two adjacent string literals are concatenated without a
# space, so the message reads "...features,n_samples=...". The reported
# n_samples / n_features are the configured constants, not values measured
# from the tfidf matrix.
print("Fitting the NMF model with tf-idf features,"
     "n_samples=%d and n_features=%d..."
     % (n_samples, n_features))
t0 = time()
# NOTE(review): the `alpha` keyword may not be accepted by newer scikit-learn
# releases -- confirm against the installed version.
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
# Column index -> word lookup used to label the NMF components.
# NOTE(review): get_feature_names() may be unavailable in newer scikit-learn
# releases -- confirm against the installed version.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the LDA model on the raw term counts.
# NOTE(review): the reported n_samples is the configured constant; the tf
# matrix was built from the whole data_samples list.
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
     % (n_samples, n_features))
# NOTE(review): the `n_topics` keyword may have been renamed in newer
# scikit-learn releases -- confirm against the installed version.
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
           learning_method='online', learning_offset=50.,
           random_state=0)
# Only the fit itself is timed; model construction above is excluded.
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# Column index -> word lookup used to label the LDA components.
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

где вместо dataset = fetch_20newsgroups я подставляю свой набор данных, который представляет собой список с темами. Программа работает хорошо и выводит темы (NMF/LDA) в виде обычного текста, как здесь:

Topics in NMF model: 
TopiC#0: 
don people just think like 
TopiC#1: 
windows thanks card file dos 
TopiC#2: 
drive scsi ide drives disk 
TopiC#3: 
god jesus bible christ faith 
TopiC#4: 
geb dsl n3jxp chastity cadre 

Как я могу представить эти результаты? Я не могу понять векторный/математический код, стоящий за реализацией. Есть ли способ визуализировать вывод с помощью графиков? А также мешок слов? Меня интересуют только результаты NMF. Я очень плохо разбираюсь в таких вещах.

+0

Посмотрите визуализации результатов тематической модели [здесь](http://stackoverflow.com/questions/30397550/visualizing-an-lda-model-using-python) –

ответ

Смежные вопросы