Мой код в целом работает отлично, однако на некоторых данных он выдаёт ошибку. Проблемные данные: «T будет 10 в этом месяце. В честь юбилея и предстоящего выпуска T @ 10 Issue эта серия оглядывается на некоторые из самых запоминающихся историй журнала за первое десятилетие». Ошибка при использовании Beautiful Soup в Python.
Сообщение об ошибке:
Traceback (most recent call last):
  File "/Users/mas/Documents/workspace/DeepLearning/BagOfWords.py", line 41, in <module>
    clean_train_reviews.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["Snippet"][i], True)))
  File "/Users/mas/Documents/workspace/DeepLearning/KaggleWord2VecUtility.py", line 22, in review_to_wordlist
    review_text = BeautifulSoup(review).get_text()
  File "/Library/Python/2.7/site-packages/bs4/__init__.py", line 162, in __init__
    elif len(markup) <= 256:
TypeError: object of type 'float' has no len()
Код (фрагмент bs4/__init__.py, на который указывает трассировка):
def deprecated_argument(old_name, new_name):
    """Fetch and remove a keyword argument passed under its old name.

    ``kwargs`` is not a parameter of this helper; it is resolved from
    the surrounding scope and is modified in place.

    Args:
        old_name: The deprecated keyword name to look for in ``kwargs``.
        new_name: The current name, used only in the warning text.

    Returns:
        The value stored under ``old_name`` (which is then removed from
        ``kwargs``), or ``None`` when ``old_name`` was not supplied.
    """
    # Guard clause: nothing to translate when the old name was not used.
    if old_name not in kwargs:
        return None
    message = (
        'The "%s" argument to the BeautifulSoup constructor '
        'has been renamed to "%s."' % (old_name, new_name))
    warnings.warn(message)
    # pop() both returns the value and drops the key, so the old name
    # is no longer present in kwargs afterwards.
    return kwargs.pop(old_name)
parse_only = parse_only or deprecated_argument(
"parseOnlyThese", "parse_only")
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
builder_class = builder_registry.lookup(*features)
if builder_class is None:
raise FeatureNotFound(
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
self.builder = builder
self.is_xml = builder.is_xml
self.builder.soup = self
self.parse_only = parse_only
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
elif len(markup) <= 256:
# Print out warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
if (isinstance(markup, unicode)
and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8")
else:
possible_filename = markup
is_file = False
try:
is_file = os.path.exists(possible_filename)
except Exception, e:
# This is almost certainly a problem involving
# characters not valid in filenames on this
# system. Just let it go.
pass
if is_file:
warnings.warn(
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
if markup[:5] == "http:" or markup[:6] == "https:":
# TODO: This is ugly but I couldn't get it to work in
# Python 3 otherwise.
if ((isinstance(markup, bytes) and not b' ' in markup)
or (isinstance(markup, unicode) and not u' ' in markup)):
warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in (
self.builder.prepare_markup(markup, from_encoding)):
self.reset()
try:
self._feed()
break
except ParserRejectedMarkup:
pass
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None
self.builder.soup = None
Это мой основной код:
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from KaggleWord2VecUtility import KaggleWord2VecUtility
import pandas as pd
import numpy as np
if __name__ == '__main__':
    # Load the training and test sets from the "data" directory that
    # sits next to this script.
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    train = pd.read_csv(os.path.join(data_dir, 'NYTimesBlogTrain.csv'), header=0)
    test = pd.read_csv(os.path.join(data_dir, 'NYTimesBlogTest.csv'), header=0)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('A sample Abstract is:')
    print(train["Abstract"][2838])
    print('A sample Snippet is:')
    print(train["Snippet"][2838])
    #raw_input("Press Enter to continue...")
    #print 'Download text data sets. If you already have NLTK datasets downloaded, just close the Python download window...'
    #nltk.download() # Download text data sets, including stop words

    snippets = train["Snippet"]
    print(len(snippets))
    print("Cleaning and parsing the training set abstracts...\n")

    # Process at most the first 3000 rows, but never index past the end
    # of the column when the file is shorter than that.
    row_count = min(3000, len(snippets))

    # One cleaned string per processed row, in row order.
    clean_train_reviews = []
    for i in range(row_count):
        snippet = snippets[i]
        # pandas reads an empty CSV cell as float NaN. Handing that to
        # BeautifulSoup raises "object of type 'float' has no len()",
        # and NaN is truthy, so a plain ``not snippet`` test misses it.
        # Check for it explicitly, and before the cleaning call.
        if pd.isnull(snippet) or not snippet:
            print(i)  # report the index of the row with no text
            # Keep a placeholder so list positions still match row
            # positions in ``train``.
            clean_train_reviews.append("")
            continue
        clean_train_reviews.append(
            " ".join(KaggleWord2VecUtility.review_to_wordlist(snippet, True)))
#
Вот значение моего отзыва (review): «T будет 10 в этом месяце. В честь юбилея и предстоящего выпуска T @ 10 Issue эта серия оглядывается на некоторые из самых запоминающихся историй журнала за первое десятилетие». @randalv – MAS
Какая у вас версия Beautiful Soup? В версии 3.2.1 метод get_text у меня вообще не находится, а с bs4 (4.3.2) я не могу воспроизвести ошибку, если передаю отзыв как строку именно с таким значением. – randalv
@randalv Я использую bs4. – MAS