Я просмотрел документацию sklearn и написал код для обучения классификатору SVM, а также его тестирования. Однако, на конечной стадии, я получаю ошибку, которую я не могу понять. Мой код, как показано ниже:Ошибка при тестировании классификатора SVM для классификации текста
rb = open_workbook('subjectcat.xlsx')#C:/Users/5460/Desktop/
wb = copy(rb) #making a copy
sheet = rb.sheet_by_index(0)
#only subjects extracted from excel file
train_set =() #list
for row_index in range(1,500): #train using 500
subject = 0
for col_index in range(1,2):
if col_index==1:
subject = sheet.cell(row_index,col_index).value
subject = "'" + subject
train_set = train_set + (subject,)
print 'only subjects'
train = list(train_set)
print len(train_set)
#for t in train_set:
# print t
vectorizer = TfidfVectorizer(min_df=1) #Tf-idf and CountVector
#extracting features from training data
#corpus = set(train_set) -- was reducing len to 468
corpus = (train_set)
print len(corpus)
x = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names() #use this for toarray() later -- this is to interpret for user
#print feature_names
x_array = x.toarray()
print x_array
print type(x_array)
print len(x_array)
#converting to numpy 2D array
data_array = np.array(x_array)
print type(data_array)
print len(data_array)
print data_array
#only categories extracted from excel file
cat_set =() #list
for row_index in range(1,500): #train using 500
subject = 0
for col_index in range(2,4):
if col_index==3:
category = sheet.cell(row_index,col_index).value
#in numerical form
catgory = int(category)
cat_set = cat_set + (category,)
#for c in cat_set:
# print c
print 'only categories'
cat_set = list(cat_set)
print len(cat_set)
cat_array = np.array(cat_set)
print cat_array
print type(cat_array)
#################################################################
#data for testing
#only subjects extracted from excel file
test_set =() #list
for row_index in range(500,575): #train using 500
subject = 0
for col_index in range(1,2):
if col_index==1:
subject = sheet.cell(row_index,col_index).value
subject = "'" + subject
test_set = test_set + (subject,)
print 'only testing subjects'
test = list(test_set)
print len(test_set)
#extracting features from testing data
test_corpus = (test_set)
print len(test_corpus)
y = vectorizer.fit_transform(test_corpus)
#feature_names = vectorizer.get_feature_names() #use this for toarray() later -- this is to interpret for user
y_array = y.toarray()
#converting to numpy 2D array
test_array = np.array(y_array)
print type(y_array)
print len(y_array)
print y_array
################################################################
def svm_learning(x,y):
clf = svm.SVC()
clf.fit(x,y)
print 'classifier trained'
return clf #returning classifier
def test_classifier(classifier):
for t in test_array:
result = classifier.predict(t)
print result
classifier = svm_learning(data_array, cat_array)
test_classifier(classifier)
Он работает до конца, где я получаю ошибку, как показано ниже:
Traceback (most recent call last):
File "C:\Users\5460\Desktop\Code\0506_01.py", line 130, in <module>
test_classifier(classifier)
File "C:\Users\5460\Desktop\Code\0506_01.py", line 125, in test_classifier
result = classifier.predict(t)
File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 466, in predict
y = super(BaseSVC, self).predict(X)
File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 282, in predict
X = self._validate_for_predict(X)
File "C:\Python27\lib\site-packages\sklearn\svm\base.py", line 404, in _validate_for_predict
(n_features, self.shape_fit_[1]))
ValueError: X.shape[1] = 315 should be equal to 1094, the number of features at training time
Я приложил результат для Referece, как показано ниже:
only subjects
499
499
[[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0.42325613 0. ]
[ 0. 0. 0. ..., 0. 0.42325613 0. ]
...,
[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0. 0. ]]
<type 'numpy.ndarray'>
499
<type 'numpy.ndarray'>
499
[[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0.42325613 0. ]
[ 0. 0. 0. ..., 0. 0.42325613 0. ]
...,
[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0. 0. ]
[ 0. 0. 0. ..., 0. 0. 0. ]]
only categories
499
[ 1. 1. 1. 0. 1. 0. 1. 0. 2. 2. 3. 3. 0. 3. 0. 0. 4. 0.
0. 2. 3. 0. 0. 3. 0. 0. 3. 0. 0. 0. 1. 4. 1. 3. 0. 3.
0. 3. 2. 3. 0. 0. 3. 2. 4. 0. 3. 2. 3. 2. 3. 3. 0. 0.
0. 3. 0. 0. 0. 3. 0. 0. 2. 0. 0. 0. 0. 0. 2. 0. 0. 0.
0. 0. 0. 4. 0. 0. 0. 0. 0. 2. 1. 1. 1. 1. 0. 1. 0. 0.
0. 3. 0. 0. 0. 3. 3. 2. 0. 3. 0. 3. 3. 4. 1. 3. 3. 0.
3. 0. 0. 0. 0. 3. 3. 1. 0. 0. 3. 2. 0. 1. 0. 1. 1. 1.
1. 1. 2. 2. 2. 2. 2. 2. 0. 0. 0. 0. 0. 3. 3. 3. 3. 3.
0. 3. 3. 0. 3. 0. 3. 3. 0. 0. 0. 3. 3. 1. 3. 3. 3. 0.
0. 0. 3. 3. 3. 3. 0. 3. 3. 3. 3. 3. 3. 0. 0. 3. 3. 3.
3. 0. 0. 3. 3. 0. 3. 3. 3. 2. 3. 3. 3. 3. 3. 0. 0. 3.
3. 3. 3. 0. 3. 3. 3. 0. 3. 3. 4. 0. 3. 0. 0. 2. 3. 0.
0. 0. 4. 4. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 2. 2.
4. 2. 2. 0. 0. 0. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 1. 0. 0. 0. 2. 2. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.
5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.]
<type 'numpy.ndarray'>
only testing subjects
75
75
<type 'numpy.ndarray'>
75
[[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
...,
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]]
classifier trained
Любая помощь относительно ошибки будет действительно оценена. Я не уверен, чего не хватает, или идет не так. Заранее большое спасибо!
Возможный дубликат [Как классифицировать документы с помощью SciKitLearn с использованием TfIdfVectorizer?] (Http://stackoverflow.com/questions/19671218/how-do-i-classify-documents-with-scikitlearn-using-tfidfvectorizer) –