Я новичок в scrapy, и я пытаюсь получить публикации этих страниц. Первые 3 функции работы (публикации, хранятся в базе данных), но мне кажется, что четвертый не работает (parse_web3, я делаю неправильноScrapy не входит в парсию
class crawler(scrapy.Spider):
name = "crawler"
start_urls = ["http://www.sac.org.ar/argentine-cardiology-journal-archive",
"http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions",
"http://road.issn.org/issn_search?afs:query=&show-adv=0&afs:replies=100#.VqaLtl4oDtR",
"http://www.intechopen.com/books"]
def parse(self, response):
for href in response.css("a::attr('href')"):
print href
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_web0)
url2 = "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions"
yield scrapy.Request(url2,callback=self.parse_web1)
url3 = "http://road.issn.org/issn_search?afs:query=&show-adv=0&afs:replies=100#.VqaLtl4oDtR"
yield scrapy.Request(url3,callback=self.parse_web2)
url4 = "http://www.intechopen.com/books"
yield scrapy.Request(url4,callback=self.parse_web3)
def parse_web0(self, response):
i=0
for sel in response.xpath("//ul[@class='d3s-revista']"):
publicaciones = sel.xpath("//li[contains(p, 'Scientific Letters')]/p[@class='d3s-titulo-post']/text()").extract()
autores = sel.xpath("//li[contains(p, 'Scientific Letters')]/p[@class='d3s-titulo-autores']/text()").extract()
links = sel.xpath("////li[contains(p, 'Scientific Letters')]/a[contains(@href,'.pdf')]/@href").extract()
if i == 0:
o=0
while o != len(publicaciones):
publicacion = PublicacionItem()
publicacion['titulo_publicacion'] = publicaciones[o]
publicacion['anio_publicacion'] = response.xpath("//p[@class='d3s-titulo-numero']/text()").re(r'\d\d\d\d')[0].strip()
publicacion['isbn'] = response.xpath("//div[@id='d3s-page-content']/div/div/div/text()").re(r'\d\d\d\d-\d\d\d\d')[0].strip()
publicacion['nombre_autor'] = autores[o]
publicacion['url_link'] = links[o]
yield publicacion
o+=1
i+=1 #este funciona
#url = "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions"
#yield scrapy.Request(url,callback=self.parse_web1)
def parse_web1(self, response):
i=0
for sel in response.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']"):
publicaciones = sel.xpath("//div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd' or @class='ds-artifact-item even']/div[@class='artifact-description']/div[@class='artifact-title']/a/text()").extract()
#publicaciones = sel.xpath("//div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item even']/div[@class='artifact-description']/div[@class='artifact-title']/a/text()").extract()
autores = sel.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd']/div[@class='artifact-description']/div[@class='artifact-info']/span[@class='author']/span/text()").extract()
links = sel.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd']/div[@class='artifact-description']/div[@class='artifact-title']/a/@href").extract()
if i == 0:
o=0
while o != len(publicaciones):
publicacion = PublicacionItem()
publicacion['titulo_publicacion'] = publicaciones[o]
publicacion['anio_publicacion'] = (response.xpath("//div[@id='ds-main']/div[@id='ds-content-wrapper']/div[@id='ds-content']/div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item odd']/div[@class='artifact-description']/div[@class='artifact-info']/span[@class='publisher-date']/span[@class='date']/text()").re(r'\d\d\d\d')[0].strip())
publicacion['isbn'] = "ISBN dentro del PDF"
publicacion['nombre_autor'] = autores[o]
publicacion['url_link'] = "http://rinfi.fi.mdp.edu.ar"+ links[o]
yield publicacion
o+=1
i+=1
#url = "http://road.issn.org/issn_search?afs%3Aquery=ciencia#.VqZ3v14oDtQ"
#yield scrapy.Request(url,callback=self.parse_web2)
#url = "http://rinfi.fi.mdp.edu.ar/xmlui/recent-submissions?offset=20"
#yield scrapy.Request(url,callback=self.parse_web2)
def parse_web2(self, response):
i=0
for sel in response.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']"):
publicaciones = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-title']/a/text()").extract()
#publicaciones = sel.xpath("//div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item even']/div[@class='artifact-description']/div[@class='artifact-title']/a/text()").extract()
autores = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-publisher']/text()").extract()
links = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-title']/a/@href").extract()
if i == 0:
o=0
while o != len(publicaciones):
publicacion = PublicacionItem()
publicacion['titulo_publicacion'] = publicaciones[o]
publicacion['anio_publicacion'] = (response.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-registration_year']").re(r'\d\d\d\d')[0].strip())
publicacion['isbn'] = response.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-issn']").re(r'\d\d\d\d-\d\d\d\d')[0].strip()
publicacion['nombre_autor'] = autores[o]
publicacion['url_link'] = links[o]
yield publicacion
o+=1
i+=1
def parse_web3(self, response): #Books
i=0
for sel in response.xpath("//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div"):
publicaciones = sel.xpath("//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dt/a/text()").extract() #publicacion
#publicaciones = sel.xpath("//div[@id='ds-body']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_main-recent-submissions']/div[@id='aspect_discovery_recentSubmissions_RecentSubmissionTransformer_div_recent-submissions']/ul[@class='ds-artifact-list']/li[@class='ds-artifact-item even']/div[@class='artifact-description']/div[@class='artifact-title']/a/text()").extract()
#autores = sel.xpath("//div[@class='page-container']/div[@class='page']/div[@id='main-content']/div[@class='main-content-inside']/div[@class='region-content']/div[@class='issn-search']/div[@class='search-results']/div[@class='search-result type-journals']/div[@class='search-result-publisher']/text()").extract()
links = sel.xpath("//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dt/a/@href").extract() #links
if i == 0:
o=0
while o != len(publicaciones):
publicacion = PublicacionItem()
publicacion['titulo_publicacion'] = publicaciones[o]
publicacion['anio_publicacion'] = (response.xpath("substring-after(//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dd[@class='meta']/text()[count(preceding-sibling::br) = 2], ', ')").re(r'\d\d\d\d')[0].strip()) #Fecha, ultimos cuatro digitos.
publicacion['isbn'] = response.xpath("substring-after(//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dd[@class='meta']/text()[count(preceding-sibling::br) = 1],'ISBN')").re(r'\d\d\d-\d\d\d-\d\d-\d\d\d\d-\d')[0].strip() #ISBN
publicacion['nombre_autor'] = sel.xpath("substring-after(//div[@id='sizer']/div[@id='content']/div[@class='grid']/div[@class='main-content']/div[@id='tc']/div/ul[@class='book-listing entity-listing']/li/dl/dd[@class='meta']/text()[count(preceding-sibling::br) = 0],'Editor')").extract() #Autor
publicacion['url_link'] = links[o]
print(publicacion['titulo_publicacion'])
raw_input()
yield publicacion
o+=1
i+=1
PS: извините за мой английский, я из Аргентина
Вы можете исправить свой отступ? – eLRuLL
Откуда вы знаете, что он не входит в 'parse_web3'? попробуйте распечатать что-то внутри 'parse_web3', чтобы проверить, работает ли он – eLRuLL
Я поставил печать, но ничего не показываю. Я экспериментировал с другими функциями, и они показывают печать правильно. –