2014-01-16 2 views
0

Я просмотрел документацию sklearn и написал код для обучения классификатора SVM и его тестирования. Однако на последнем этапе я получаю ошибку, которую не могу понять. Мой код приведён ниже. Тема вопроса: «Ошибка при тестировании классификатора SVM для классификации текста».

# Open the spreadsheet and use its first sheet as the data source.
rb = open_workbook('subjectcat.xlsx')#C:/Users/5460/Desktop/ 
wb = copy(rb) #making a copy (not used again in the code shown here)
sheet = rb.sheet_by_index(0) 

#only subjects extracted from excel file  
train_set =() #tuple of subject strings, grown one element at a time
for row_index in range(1,500): #rows 1..499 -> 499 training samples
    subject = 0 
    for col_index in range(1,2):   #single iteration: col_index is always 1
     if col_index==1: 
      subject = sheet.cell(row_index,col_index).value 
      subject = "'" + subject #prefix every subject with a single quote character
      train_set = train_set + (subject,) 

print 'only subjects' 
train = list(train_set) #list copy; not referenced again in the code shown here
print len(train_set) 
#for t in train_set: 
# print t 

vectorizer = TfidfVectorizer(min_df=1) #Tf-idf and CountVector 
#extracting features from training data 
#corpus = set(train_set) -- was reducing len to 468 
corpus = (train_set) #parentheses are redundant: corpus is the same tuple as train_set
print len(corpus) 
#fit_transform learns the vocabulary from the training corpus and vectorizes it
x = vectorizer.fit_transform(corpus) 
feature_names = vectorizer.get_feature_names() #use this for toarray() later -- this is to interpret for user 
#print feature_names 

x_array = x.toarray() #dense matrix with one row per training subject
print x_array 
print type(x_array) 
print len(x_array) 

#converting to numpy 2D array 
data_array = np.array(x_array) #x_array already prints as numpy.ndarray, so this is just a copy
print type(data_array) 
print len(data_array) 
print data_array 

#only categories extracted from excel file  
cat_set =() #tuple of labels, converted to a list after the loop
for row_index in range(1,500): #same rows 1..499 as the training subjects
    subject = 0 #not used in this loop
    for col_index in range(2,4):   #visits columns 2 and 3; only column 3 is read
     if col_index==3: 
      category = sheet.cell(row_index,col_index).value 
      #in numerical form 
      #NOTE(review): the int() result is bound to the misspelled name `catgory`,
      #so the unconverted `category` value is what gets stored on the next line
      catgory = int(category) 
      cat_set = cat_set + (category,) 

#for c in cat_set: 
# print c 
print 'only categories' 
cat_set = list(cat_set) 
print len(cat_set) 
cat_array = np.array(cat_set) #label vector passed to the classifier as y
print cat_array 
print type(cat_array) 

################################################################# 

#data for testing 
#only subjects extracted from excel file  
test_set =() #tuple of subject strings, grown one element at a time
for row_index in range(500,575): #rows 500..574 -> 75 test samples
    subject = 0 
    for col_index in range(1,2):   #single iteration: col_index is always 1
     if col_index==1: 
      subject = sheet.cell(row_index,col_index).value 
      subject = "'" + subject #same single-quote prefix as the training subjects
      test_set = test_set + (subject,) 

print 'only testing subjects' 
test = list(test_set) #list copy; not referenced again in the code shown here
print len(test_set) 

#extracting features from testing data 
test_corpus = (test_set) 
print len(test_corpus) 
#NOTE(review): fit_transform re-fits the vectorizer on the test corpus, so the
#test matrix is built from the test vocabulary rather than the training one and
#its column count no longer matches the training matrix; calling transform()
#here instead would reuse the vocabulary learned from the training corpus
y = vectorizer.fit_transform(test_corpus) 
#feature_names = vectorizer.get_feature_names() #use this for toarray() later -- this is to interpret for user 

y_array = y.toarray() 
#converting to numpy 2D array 
test_array = np.array(y_array) #read as a global by test_classifier
print type(y_array) 
print len(y_array) 
print y_array 

################################################################ 

#Fit an SVC on feature matrix x with labels y and return the fitted classifier.
def svm_learning(x,y): 
    clf = svm.SVC() #constructed with default parameters
    clf.fit(x,y) 
    print 'classifier trained' 
    return clf #returning classifier 

#Predict and print a label for every row of the module-level test_array.
def test_classifier(classifier): 
    for t in test_array: #t is a single row of the test feature matrix
     #predict raises ValueError when the row's feature count differs from
     #the number of features the classifier was trained on
     result = classifier.predict(t) 
     print result 


#Train on the vectorized training subjects and their categories, then run the per-row test loop.
classifier = svm_learning(data_array, cat_array) 
test_classifier(classifier) 

Код выполняется почти до самого конца, где я получаю следующую ошибку:

Traceback (most recent call last): 
    File "C:\Users\5460\Desktop\Code\0506_01.py", line 130, in <module> 
    test_classifier(classifier) 
    File "C:\Users\5460\Desktop\Code\0506_01.py", line 125, in test_classifier 
    result = classifier.predict(t) 
    File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 466, in predict 
    y = super(BaseSVC, self).predict(X) 
    File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 282, in predict 
    X = self._validate_for_predict(X) 
    File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 404, in _validate_for_predict 
    (n_features, self.shape_fit_[1])) 
ValueError: X.shape[1] = 315 should be equal to 1094, the number of features at training time 

Для справки прилагаю вывод программы:

only subjects 
499 
499 
[[ 0.   0.   0.   ..., 0.   0.   0.  ] 
[ 0.   0.   0.   ..., 0.   0.42325613 0.  ] 
[ 0.   0.   0.   ..., 0.   0.42325613 0.  ] 
..., 
[ 0.   0.   0.   ..., 0.   0.   0.  ] 
[ 0.   0.   0.   ..., 0.   0.   0.  ] 
[ 0.   0.   0.   ..., 0.   0.   0.  ]] 
<type 'numpy.ndarray'> 
499 
<type 'numpy.ndarray'> 
499 
[[ 0.   0.   0.   ..., 0.   0.   0.  ] 
[ 0.   0.   0.   ..., 0.   0.42325613 0.  ] 
[ 0.   0.   0.   ..., 0.   0.42325613 0.  ] 
..., 
[ 0.   0.   0.   ..., 0.   0.   0.  ] 
[ 0.   0.   0.   ..., 0.   0.   0.  ] 
[ 0.   0.   0.   ..., 0.   0.   0.  ]] 
only categories 
499 
[ 1. 1. 1. 0. 1. 0. 1. 0. 2. 2. 3. 3. 0. 3. 0. 0. 4. 0. 
    0. 2. 3. 0. 0. 3. 0. 0. 3. 0. 0. 0. 1. 4. 1. 3. 0. 3. 
    0. 3. 2. 3. 0. 0. 3. 2. 4. 0. 3. 2. 3. 2. 3. 3. 0. 0. 
    0. 3. 0. 0. 0. 3. 0. 0. 2. 0. 0. 0. 0. 0. 2. 0. 0. 0. 
    0. 0. 0. 4. 0. 0. 0. 0. 0. 2. 1. 1. 1. 1. 0. 1. 0. 0. 
    0. 3. 0. 0. 0. 3. 3. 2. 0. 3. 0. 3. 3. 4. 1. 3. 3. 0. 
    3. 0. 0. 0. 0. 3. 3. 1. 0. 0. 3. 2. 0. 1. 0. 1. 1. 1. 
    1. 1. 2. 2. 2. 2. 2. 2. 0. 0. 0. 0. 0. 3. 3. 3. 3. 3. 
    0. 3. 3. 0. 3. 0. 3. 3. 0. 0. 0. 3. 3. 1. 3. 3. 3. 0. 
    0. 0. 3. 3. 3. 3. 0. 3. 3. 3. 3. 3. 3. 0. 0. 3. 3. 3. 
    3. 0. 0. 3. 3. 0. 3. 3. 3. 2. 3. 3. 3. 3. 3. 0. 0. 3. 
    3. 3. 3. 0. 3. 3. 3. 0. 3. 3. 4. 0. 3. 0. 0. 2. 3. 0. 
    0. 0. 4. 4. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 2. 2. 
    4. 2. 2. 0. 0. 0. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 1. 0. 0. 0. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 
    0. 0. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 
    5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.] 
<type 'numpy.ndarray'> 
only testing subjects 
75 
75 
<type 'numpy.ndarray'> 
75 
[[ 0. 0. 0. ..., 0. 0. 0.] 
[ 0. 0. 0. ..., 0. 0. 0.] 
[ 0. 0. 0. ..., 0. 0. 0.] 
..., 
[ 0. 0. 0. ..., 0. 0. 0.] 
[ 0. 0. 0. ..., 0. 0. 0.] 
[ 0. 0. 0. ..., 0. 0. 0.]] 
classifier trained 

Буду очень признателен за любую помощь с этой ошибкой. Я не понимаю, чего не хватает или что идёт не так. Заранее большое спасибо!

+0

Возможный дубликат: [Как классифицировать документы с помощью SciKitLearn с использованием TfIdfVectorizer?](http://stackoverflow.com/questions/19671218/how-do-i-classify-documents-with-scikitlearn-using-tfidfvectorizer) –

ответ

2
y = vectorizer.fit_transform(test_corpus) 

заново обучает векторизатор на тестовом корпусе, и тот запоминает словарь тестового корпуса, который отличается от словаря обучающего корпуса, поэтому вы получаете другой набор признаков. Для тестового набора используйте transform вместо fit_transform.

Смежные вопросы