I took this code from https://github.com/davidadamojr/TextRank and I am running into the following problem. I tried to fix it by adding utf-8, as in "keyphrases = decode('utf-8').extractKeyphrases(text)", but that did not help:

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128)
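For context, the underlying behaviour can be reproduced with a minimal Python 2 snippet (an illustration, not part of the original script): when a UTF-8 byte string is combined with unicode text, Python 2 falls back to the ascii codec and raises exactly this kind of error.

raw = 'caf\xc3\xa9 \xe2\x80\x94 test'   #UTF-8 bytes, e.g. as read from a file opened in 'r' mode

try:
    u'' + raw                            #implicit decode with the ascii codec -> fails
except UnicodeDecodeError as err:
    print err                            #'ascii' codec can't decode byte 0xc3 in position 3

print repr(raw.decode('utf-8'))          #explicit decode works: u'caf\xe9 \u2014 test'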

Here is the full script:

""" 
From this paper: http://acl.ldc.upenn.edu/acl2004/emnlp/pdf/Mihalcea.pdf 

External dependencies: nltk, numpy, networkx 

Based on https://gist.github.com/voidfiles/1646117 
""" 

import nltk 
import itertools 
from operator import itemgetter 
import networkx as nx 
import sys 
import os 

#apply syntactic filters based on POS tags 
def filter_for_tags(tagged, tags=['NN', 'JJ', 'NNP']): 
    return [item for item in tagged if item[1] in tags] 

def normalize(tagged): 
    return [(item[0].replace('.', ''), item[1]) for item in tagged] 

def unique_everseen(iterable, key=None): 
    "List unique elements, preserving order. Remember all elements ever seen." 
    # unique_everseen('AAAABBBCCDAABBB') --> A B C D 
    # unique_everseen('ABBCcAD', str.lower) --> A B C D 
    seen = set() 
    seen_add = seen.add 
    if key is None: 
        for element in itertools.ifilterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element

def lDistance(firstString, secondString): 
    "Function to find the Levenshtein distance between two words/sentences - gotten from http://rosettacode.org/wiki/Levenshtein_distance#Python" 
    if len(firstString) > len(secondString): 
        firstString, secondString = secondString, firstString
    distances = range(len(firstString) + 1)
    for index2, char2 in enumerate(secondString):
        newDistances = [index2 + 1]
        for index1, char1 in enumerate(firstString):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1], distances[index1+1], newDistances[-1])))
        distances = newDistances
    return distances[-1] 

def buildGraph(nodes): 
    "nodes - list of hashables that represents the nodes of the graph" 
    gr = nx.Graph() #initialize an undirected graph 
    gr.add_nodes_from(nodes) 
    nodePairs = list(itertools.combinations(nodes, 2)) 

    #add edges to the graph (weighted by Levenshtein distance) 
    for pair in nodePairs: 
        firstString = pair[0]
        secondString = pair[1]
        levDistance = lDistance(firstString, secondString)
        gr.add_edge(firstString, secondString, weight=levDistance)

    return gr 

def extractKeyphrases(text): 
    #tokenize the text using nltk 
    wordTokens = nltk.word_tokenize(text) 

    #assign POS tags to the words in the text 
    tagged = nltk.pos_tag(wordTokens) 
    textlist = [x[0] for x in tagged] 

    tagged = filter_for_tags(tagged) 
    tagged = normalize(tagged) 

    unique_word_set = unique_everseen([x[0] for x in tagged]) 
    word_set_list = list(unique_word_set) 

    #this will be used to determine adjacent words in order to construct keyphrases with two words 

    graph = buildGraph(word_set_list) 

    #pageRank - initial value of 1.0, error tolerance of 0.0001
    calculated_page_rank = nx.pagerank(graph, weight='weight') 

    #most important words in ascending order of importance 
    keyphrases = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True) 

    #the number of keyphrases returned will be relative to the size of the text (a third of the number of vertices) 
    aThird = len(word_set_list)/3 
    keyphrases = keyphrases[0:aThird+1] 

    #take keyphrases with multiple words into consideration as done in the paper - if two words are adjacent in the text and are selected as keywords, join them 
    #together 
    modifiedKeyphrases = set([]) 
    dealtWith = set([]) #keeps track of individual keywords that have been joined to form a keyphrase 
    i = 0 
    j = 1 
    while j < len(textlist): 
        firstWord = textlist[i]
        secondWord = textlist[j]
        if firstWord in keyphrases and secondWord in keyphrases:
            keyphrase = firstWord + ' ' + secondWord
            modifiedKeyphrases.add(keyphrase)
            dealtWith.add(firstWord)
            dealtWith.add(secondWord)
        else:
            if firstWord in keyphrases and firstWord not in dealtWith:
                modifiedKeyphrases.add(firstWord)

            #if this is the last word in the text, and it is a keyword,
            #it definitely has no chance of being a keyphrase at this point
            if j == len(textlist)-1 and secondWord in keyphrases and secondWord not in dealtWith:
                modifiedKeyphrases.add(secondWord)

        i = i + 1
        j = j + 1

    return modifiedKeyphrases 

def extractSentences(text): 
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 
    sentenceTokens = sent_detector.tokenize(text.strip()) 
    graph = buildGraph(sentenceTokens) 

    calculated_page_rank = nx.pagerank(graph, weight='weight') 

    #most important sentences in ascending order of importance 
    sentences = sorted(calculated_page_rank, key=calculated_page_rank.get, reverse=True) 

    #return a 100 word summary 
    summary = ' '.join(sentences) 
    summaryWords = summary.split() 
    summaryWords = summaryWords[0:101] 
    summary = ' '.join(summaryWords) 

    return summary 

def writeFiles(summary, keyphrases, fileName): 
    "outputs the keyphrases and summaries to appropriate files" 
    print "Generating output to " + 'keywords/' + fileName 
    keyphraseFile = open('keywords/' + fileName, 'w') 
    for keyphrase in keyphrases: 
        keyphraseFile.write(keyphrase + '\n')
    keyphraseFile.close() 

    print "Generating output to " + 'summaries/' + fileName 
    summaryFile = open('summaries/' + fileName, 'w') 
    summaryFile.write(summary) 
    summaryFile.close() 

    print "-" 


#retrieve each of the articles 
articles = os.listdir("articles") 
for article in articles: 
    print 'Reading articles/' + article 
    articleFile = open('articles/' + article, 'r') 
    text = articleFile.read() 
    keyphrases = decode('utf-8').extractKeyphrases(text) 
    summary = extractSentences(text) 
    writeFiles(summary, keyphrases, article) 

The error:

Reading articles/1.txt 

Traceback (most recent call last): 
    File "C:\Users\DELL\Desktop\python\s\fyp\Relevancy\test\TextRank-master\textrank.py", line 166, in <module> 
    keyphrases = extractKeyphrases(text).setdefaultencoding("utf-8") 
    File "C:\Users\DELL\Desktop\python\s\fyp\Relevancy\test\TextRank-master\textrank.py", line 72, in extractKeyphrases 
    wordTokens = nltk.word_tokenize(text) 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\__init__.py", line 93, in word_tokenize 
    return [token for sent in sent_tokenize(text) 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\__init__.py", line 82, in sent_tokenize 
    return tokenizer.tokenize(text) 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1270, in tokenize 
    return list(self.sentences_from_text(text, realign_boundaries)) 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1318, in sentences_from_text 
    return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)] 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1309, in span_tokenize 
    return [(sl.start, sl.stop) for sl in slices] 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1348, in _realign_boundaries 
    for sl1, sl2 in _pair_iter(slices): 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 354, in _pair_iter 
    prev = next(it) 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1324, in _slices_from_text 
    if self.text_contains_sentbreak(context): 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1369, in text_contains_sentbreak 
    for t in self._annotate_tokens(self._tokenize_words(text)): 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 1504, in _annotate_second_pass 
    for t1, t2 in _pair_iter(tokens): 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 354, in _pair_iter 
    prev = next(it) 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 621, in _annotate_first_pass 
    for aug_tok in tokens: 
    File "C:\Python27\lib\site-packages\nltk-3.0.1-py2.7-win32.egg\nltk\tokenize\punkt.py", line 586, in _tokenize_words 
    for line in plaintext.split('\n'): 
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 4: ordinal not in range(128) 

Any ideas? (Sorry for my bad English.)

Answer


I think what you are looking for is:

# ... 
text = articleFile.read().decode('utf-8') 
keyphrases = extractKeyphrases(text) 
# ... 

Basically, you want to decode the file contents into a Unicode string as soon as you have read them; then the rest of your program is shielded from conversion problems. Also make sure that the file really is UTF-8 encoded. If you are not sure, try latin1 as the encoding, because decoding with latin1 never raises an exception (although it will produce garbled text if the file is not actually latin1-encoded).
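If you prefer to do the decoding at the point where the file is opened, Python 2's io.open (or codecs.open) accepts an explicit encoding, so read() already returns unicode. A rough sketch of the reading loop under that assumption (the article files really are UTF-8, and the functions come from the script above):

import io
import os

#sketch only: io.open with an encoding returns unicode directly,
#so no separate .decode('utf-8') call is needed afterwards
for article in os.listdir('articles'):
    print 'Reading articles/' + article
    with io.open('articles/' + article, 'r', encoding='utf-8') as articleFile:
        text = articleFile.read()   #text is already a unicode object here
    keyphrases = extractKeyphrases(text)
    summary = extractSentences(text)
    writeFiles(summary, keyphrases, article)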


Yes, thank you very much – user3162878
