2013-11-10 6 views
-2

Цель этой программы - сопоставить подпись авторов (стиль письма) с их частями, чтобы выяснить, кто такой автор. Я в основном работает с программой, но когда я иду запускать ее, кроме первого приглашения, где он запрашивает имя файла, он ничего не делает. Я не могу понять, почему он не движется с файлом. Это дает мне сообщение «Этот файл не найден», но файл IS находится в правильном каталоге, поэтому я не знаю, что происходит.Программа выполнена, но не работает

Довольно большая программа относительно говоря, поэтому я не ожидаю, что люди действительно углублятся в глубину, но я просто не могу понять это сам.

import os.path, math 

def clean_up(s): 
    ''' Return a version of string str in which all letters have been 
    converted to lowercase and punctuation characters have been stripped 
    from both ends. Inner punctuation is left untouched. ''' 

    punctuation = '''!"',;:.-?)([]<>*#\n\t\r''' 
    result = s.lower().strip(punctuation) 
    return result 


def average_word_length(text): 
    ''' Return the average length of all words in text. Do not 
    include surrounding punctuation in words. 
    text is a non-empty list of strings each ending in \n. 
    At least one line in text contains a word.''' 

    words = text.split() 
    for word in words: 
      average=sum(len(word) for word in words)/len(words)   
    return average 


def type_token_ratio(text): 
    ''' Return the type token ratio (TTR) for this text. 
    TTR is the number of different words divided by the total number of words. 
    text is a non-empty list of strings each ending in \n. 
    At least one line in text contains a word. ''' 
    uniquewords=dict() 
    words=0 
    for line in text.splitlines(): 
     line=line.strip().split() 
     for word in line: 
      words+=1 
      if word in uniquewords: 
       uniquewords[word]+=1 
      else: 
       uniquewords[word]=1 
    TTR= len(uniquewords)/words 
    return TTR 


def hapax_legomana_ratio(text): 
    ''' Return the hapax_legomana ratio for this text. 
    This ratio is the number of words that occur exactly once divided 
    by the total number of words. 
    text is a list of strings each ending in \n. 
    At least one line in text contains a word.''' 

    uniquewords = dict() 
    words = 0 
    for line in text.splitlines(): 
     line = line.strip().split() 
     for word in line: 
      words += 1 
      word = word.replace(',', '').strip() 
      if word in uniquewords: 
       uniquewords[word] -= 1 
      else: 
       uniquewords[word] = 1 

    unique_count = 0 
    for each in uniquewords: 
     if uniquewords[each] == 1: 
      unique_count += 1 
    HLR = unique_count/words 

    return HLR  


def split_on_separators(original, separators): 
    ''' Return a list of non-empty, non-blank strings from the original string 
    determined by splitting the string on any of the separators. 
    separators is a string of single-character separators.''' 


    result = [] 
    newstring='' 

    for char in original: 
     if char in separators: 
      result.append(newstring) 
      newstring='' 
      if '' in result: 
       result.remove('') 
     else: 
      newstring+=char 
    return result 

def average_sentence_length(text): 
    ''' Return the average number of words per sentence in text. 
    text is guaranteed to have at least one sentence. 
    Terminating punctuation defined as !?. 
    A sentence is defined as a non-empty string of non-terminating 
    punctuation surrounded by terminating punctuation 
    or beginning or end of file. ''' 
    words=0 
    Sentences=0 
    for line in text.split(): 
     words+=1 
    sentence=split_on_separators(text,'?!.') 
    for sep in sentence: 
     Sentences+=1 

    ASL=words/Sentences 
    return ASL 


def avg_sentence_complexity(text): 
    '''Return the average number of phrases per sentence. 
    Terminating punctuation defined as !?. 
    A sentence is defined as a non-empty string of non-terminating 
    punctuation surrounded by terminating punctuation 
    or beginning or end of file. 
    Phrases are substrings of a sentences separated by 
    one or more of the following delimiters ,;: ''' 
    Sentences=0 
    Phrases=0 
    sentence=split_on_separators(text,'?!.') 
    for sep in sentence: 
     Sentences+=1 
    Phrase=split_on_separators(text, ',;:') 
    for n in Phrase: 
     Phrases+=1 
    ASC=Phrases/Sentences 
    return ASC 


def get_valid_filename(prompt): 
    '''Use prompt (a string) to ask the user to type the name of a file. If 
    the file does not exist, keep asking until they give a valid filename. 
    Return the name of that file.''' 


    filename = input(prompt) 
    if os.path.isfile(filename) == False: 
     print ("That file does not exist.") 
     filename = input(prompt) 

    return filename 


def read_directory_name(prompt): 
    '''Use prompt (a string) to ask the user to type the name of a directory. If 
    the directory does not exist, keep asking until they give a valid directory. 
    ''' 

    dirname = input(prompt) 
    if os.path.isfile(dirname)== False: 
     print ("That directory does not exist.") 
     dirname=input(prompt) 
    return dirname 


def compare_signatures(sig1, sig2, weight): 
    '''Return a non-negative real number indicating the similarity of two 
    linguistic signatures. The smaller the number the more similar the 
    signatures. Zero indicates identical signatures. 
    sig1 and sig2 are 6 element lists with the following elements 
    0 : author name (a string) 
    1 : average word length (float) 
    2 : TTR (float) 
    3 : Hapax Legomana Ratio (float) 
    4 : average sentence length (float) 
    5 : average sentence complexity (float) 
    weight is a list of multiplicative weights to apply to each 
    linguistic feature. weight[0] is ignored. 
    ''' 

    i=1 
    while i <=5: 
     result +=(abs(sig1[i]-sig2[i]))*weight[i] 
     i+=1 
    return result 


def read_signature(filename): 
    '''Read a linguistic signature from filename and return it as 
    list of features. ''' 

    file = open(filename, 'r') 
    # the first feature is a string so it doesn't need casting to float 
    result = [file.readline()] 
    # all remaining features are real numbers 
    for line in file: 
     result.append(float(line.strip())) 
    return result 


if __name__ == '__main__': 

    prompt = 'enter the name of the file with unknown author:' 
    mystery_filename = get_valid_filename(prompt) 

    # readlines gives us the file as a list of strings each ending in '\n' 
    text = open(mystery_filename, 'r').read() 
    text.close() 
    # calculate the signature for the mystery file 
    mystery_signature = [mystery_filename] 
    mystery_signature.append(average_word_length(text)) 
    mystery_signature.append(type_token_ratio(text)) 
    mystery_signature.append(hapax_legomana_ratio(text)) 
    mystery_signature.append(average_sentence_length(text)) 
    mystery_signature.append(avg_sentence_complexity(text)) 

    weights = [0, 11, 33, 50, 0.4, 4] 

    prompt = 'enter the path to the directory of signature files: ' 
    dir = read_directory_name(prompt) 
    # every file in this directory must be a linguistic signature 
    files=os.listdir(dir) 

    # to do: Get list of all files in directory name we just got. 
    # store it in a list called 'files' 

    # we will assume that there is at least one signature in that directory 
    this_file = files[0] 
    signature = read_signature('{} {}'.format(dir,this_file)) 
    best_score = compare_signatures(mystery_signature, signature, weights) 
    best_author = signature[0] 
    for this_file in files[1:]: 
     signature = read_signature('{} {}'.format(dir,this_file)) 
     score = compare_signatures(mystery_signature, signature, weights) 
     if score < best_score: 
      best_score = score 
      best_author = signature[0] 
    print("best author match: {} with score {}".format(best_author, best_score)) 

ответ

1

В следующей строке (появляются дважды), путь генерируют путем присоединения реж часть имени файла и часть с пространства.

signature = read_signature('{} {}'.format(dir,this_file)) 
#       ^

Чтобы присоединиться к ним, вы должны использовать os.sep.

signature = read_signature('{}{}{}'.format(dir, os.sep, this_file)) 

или более предпочтительно использовать os.path.join:

signature = read_signature(os.path.join(dir, this_file)) 
+0

Он по-прежнему дает сообщение об ошибке. – Nick

+0

Ошибка без исключения - FileNotFoundError, но я работаю в правильном каталоге, поэтому я не понимаю, почему он не открывается. – Nick

+1

@ Ник, Не могли бы вы показать трассировку? (Обновите вопрос) – falsetru

0

Вы уверены, что он сообщает "Этот файл не существует." а не «Этот каталог не существует».?

В read_directory_name(prompt) автор использует os.path.isfile(path), а документы для него: «Вернуть True, если путь является существующим обычным файлом. (...)».

Поскольку вы ищете каталог, который вам нужен os.path.isdir или os.path.exists.

Смежные вопросы