Я пытаюсь очистить URL с помощью BeautifulSoup/Requests, а затем очистить его, вытащив только те разделы, которые мне нужны. После того, как вы решили использовать другой целевой URL, он выводит HTML правильно, но мой код для очистки не работает. Вот мой код:Python Скребок/Парсинг с BeautifulSoup
import requests
from bs4 import BeautifulSoup
import bs4.element
import pprint
def connection(url):
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5)'}
r = requests.get(url,headers=headers)
soup = BeautifulSoup(r.text)
return soup
def scrape_metacritic(soup,movie_list=[]):
for mlist in get_modules(soup).items():
for movie in mlist:
try:
m = parse_movie_li(movie)
except:
continue
#m['release_type']=release_type
movie_list.append(m)
return movie_list
def just_tags(templist):
tags = [t for t in templist if isinstance(t,bs4.element.Tag)]
return tags
def get_modules(soup):
module = soup.find(class_='body_wrap') #body_wrap
module_dict = {}
for mod in module.find_all('li', class_='product'):
movie_lis = just_tags(mod.find('ul').contents)
module_dict[mod]=movie_lis
return module_dict
get_modules(soup)
Эта часть работает. Вот URL:
url = 'http://www.metacritic.com/browse/movies/title/dvd/a?view=detailed'
soup = connection(url)
Это часть того, что я получаю после соскоба:
{<li class="product limited_release_product has_small_image"><div class="wrap product_wrap"><div class="product_basics stats"><div class="basic_stats has_score"><div class="main_stats"><div class="basic_stat product_title"><h3 class="product_title"><a href="/movie/a-birders-guide-to-everything">A Birder's Guide to Everything</a></h3></div><a class="basic_stat product_score" href="/movie/a-birders-guide-to-everything">
<span class="metascore_w medium movie positive">61</span>
</a></div> <div class="more_stats extended_stats">
<ul class="more_stats">
<li class="stat release_date">
<span class="label">Release Date:</span>
<span class="data">March 21, 2014</span>
</li>
<li class="stat rating">
<span class="label">Rated:</span>
<span class="data">
..
<span class="data">136 min</span>
</li>]}
Теперь я стараюсь, чтобы очистить его с этим:
from dateutil import parser
def parse_movie_li(li):
title_div = li.find(class_='product_title')
movie = {
'title':title_div.text.strip(),
'rel_url':title_div.find('a')['href'],
'release_date':get_release_date(li.find(class_='release_date').find(class_='data')),
'metascore_w':get_metascore_w(li.find(class_='metascore_w')),
'user_score':get_user_score(li.find(class_='product avg_userscore').find(class_='data')), #add func
'genre':get_genre(li.find(class_='genre').find(class_='data')), #add func
'star_cast':get_star_cast(li.find(class_='cast').find(class_='data')), #add func
'runtime':get_runtime(li.find(class_='runtime').find(class_='data')) #add func
}
#print movie,'\n'
return movie
def get_metascore_w(div):
try:
score = div.text
except:
print 'no text in metascore div'
return None
try:
score = int(score)
except:
pass
return score
def get_release_date(div):
try:
datestr = div.text
except:
return None
try:
date = parser.parse(datestr)
except:
return datestr
return date
def get_user_score(div):
try:
uscore = div.text
except:
print 'no text in userscore div'
return None
try:
uscore = int(uscore)
except:
pass
return uscore
def get_genre(div):
try:
genre = div.text
except:
print 'no text in genre div'
return None
try:
genre
except:
pass
return score
def get_star_cast(div):
try:
cast = div.text
except:
print 'no text in cast div'
return None
try:
cast
except:
pass
return cast
def get_runtime(div):
try:
runtime = div.text.strip(' min')
except:
print 'no text in runtime div'
return None
try:
runtime = int(runtime)
except:
pass
return runtime
Он должен должны быть выданы в таком виде:
[{'metascore_w': 28,
'rel_url': '/movie/mortdecai',
'release_date': datetime.datetime(2015, 1, 23, 0, 0),
'release_type': u'Wide releases now in theaters',
'title': u'Mortdecai'},
{'metascore_w': 24,
'rel_url': '/movie/strange-magic',
'release_date': datetime.datetime(2015, 1, 23, 0, 0),
'release_type': u'Wide releases now in theaters',
'title': u'Strange Magic'},
..
{'metascore_w': u'tbd',
'rel_url': '/movie/20-once-again',
'release_date': datetime.datetime(2015, 1, 16, 0, 0),
'release_type': u'Limited releases now in theaters',
'title': u'20 Once Again'}]
Howeve r, я получаю это:
{<li class="product limited_release_product has_small_image alt"><div class="wrap product_wrap"><div class="product_basics stats"><div class="basic_stats has_score"><div class="main_stats"><div class="basic_stat product_title"><h3 class="product_title"><a href="/movie/a-family-thing">A Family Thing</a></h3></div><a class="basic_stat product_score" href="/movie/a-family-thing">
<span class="metascore_w medium movie positive">71</span>
</a></div> <div class="more_stats extended_stats">
<ul class="more_stats">
<li class="stat release_date">
<span class="label">Release Date:</span>
<span class="data">March 29, 1996</span>
</li>..
Это непроверено. Любые указания относительно того, что я делаю неправильно с функцией parse_movie_li?
Что такое 'just_tags()'? – alecxe
Забыл добавить еще 2 функции .. thx @alexce – DNburtonguster