Я написал PoC, чтобы помочь вам понять, как это можно сделать с помощью библиотеки requests.
Этот скрипт собирает (scrape) только:
заголовок и ссылку каждой новости/статьи на каждой странице результатов поиска по заданным ключевым словам.
Вы можете адаптировать код для сбора конкретных данных, которые вас интересуют.
В коде есть комментарии, поэтому я не буду объяснять слишком много вне кода. Тем не менее, если у вас есть дополнительные вопросы, просто дайте мне знать.
from lxml import html
from requests import Session
## Endpoints and client identification used by the whole script
LOGIN_URL = "http://infotrac.galegroup.com/default/palm83799?db=SP19"
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/46.0.2490.80 Safari/537.36"
)

## Form fields that are POSTed to LOGIN_URL to authenticate
## NOTE(review): 'locpword' looks like a hard-coded credential -- confirm it is safe to keep in the source
payload = dict(
    db='SP19',
    locpword='25913000925235',
    proceed='Authenticate',
)

## Headers sent with every request made through the shared session
headers = {'User-Agent': USER_AGENT}

## Shared requests.Session instance; it keeps cookies between requests
s = Session()
s.headers.update(headers)

## First request to LOGIN_URL so the session picks up the cookies needed later
s.get(LOGIN_URL)
def extractTitlesAndLinksFromPaginatePageResponse(response, page):
    """Extract the title and link of every result listed on one results page.

    Parameters:
        response: response object whose ``content`` attribute holds the HTML
            of a search results page or of a paginate response.
        page: page number; used as the single top-level key of the result.

    Returns:
        A dictionary with the following structure, where the inner keys are
        the result numbers within the page, starting at 1:

        {
            page: {
                "news": {
                    1: {
                        "title": "Drought swept aside; End-of-angst story? This is much more.",
                        "link": "http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=1921&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA138024966&contentSet=GALE%7CA138024966",
                    },
                    2: {
                        "title": "The Fast Life.",
                        "link": "http://go.galegroup.com/ps/retrieve.do?...",
                    },
                    ## ...and so on...
                }
            }
        }

        "news" is an empty dictionary when no result is found in the page.
    """
    ## Anchor that carries both the title text and the relative link of a result
    anchor_xpath = './/div[@class="titleWrapper"]/span[@class="title"]/a'

    ## Parsing the HTML from response.content
    tree = html.fromstring(response.content)

    entries = []
    for result in tree.xpath('//li[@class="citation-view"]'):
        anchors = result.xpath(anchor_xpath)
        ## href and text are read separately instead of unpacking one combined
        ## node list, so a result without anchor/href is skipped rather than
        ## raising ValueError, and link/title can never be swapped
        href = anchors[0].get('href') if anchors else None
        if href is None:
            continue
        ## Joining the direct text nodes keeps the title intact even when the
        ## anchor text is split into several nodes
        title = "".join(anchors[0].xpath('text()'))
        ## Links in the page are relative; prefix them to get a full URL
        link = "{}{}".format("http://go.galegroup.com/ps/", href)
        entries.append(dict(title=title, link=link))

    ## Number the kept results 1, 2, 3... and wrap them under the page key
    return {page: dict(news=dict(enumerate(entries, 1)))}
def searchKeyWord(search_string):
    """Search for ``search_string`` and collect the results of every page.

    Uses the module-level session ``s`` for the requests and
    extractTitlesAndLinksFromPaginatePageResponse() to parse each page.

    Parameters:
        search_string: the keyword(s) to search for,
            e.g. "byline john romano".

    Returns:
        A dictionary with the following structure:

        {
            "keyword": search_string,   ## exactly as it was passed in
            "pages": {                  ## one entry per scraped page
                1: {"news": {...}},
                2: {"news": {...}},
            }
        }
    """
    full_news = dict(keyword=search_string, pages=None)
    ## All the pages and their news; becomes the value of full_news["pages"]
    pages = dict()

    ## Replacing spaces with 'plus' sign to match the website's behavior
    search_string = search_string.replace(' ', '+')
    ## URL of the first page for every search request
    search_url = "http://go.galegroup.com/ps/basicSearch.do?inputFieldValue(0)={}&inputFieldName(0)=OQE&inputFieldName(0)=OQE&nwf=y&searchType=BasicSearchForm&userGroupName=palm83799&prodId=SPJ.SP19&method=doSearch&dblist=&standAloneLimiters=LI".format(search_string)

    ## count = number of the page we are currently scraping
    ## currentPosition = offset of the first result of the next page; it
    ## starts at 21 and grows by 20 for each page we request
    count = 1
    currentPosition = 21

    ## First results page comes from the search URL itself
    first_page_response = s.get(search_url)
    pages.update(extractTitlesAndLinksFromPaginatePageResponse(first_page_response, count))

    ## Special headers so the paginate requests follow the website's behavior
    paginate_headers = {'X-Requested-With':'XMLHttpRequest', 'Referer':search_url}

    while True:
        count += 1
        paginate_url = "http://go.galegroup.com/ps/paginate.do?currentPosition={}&inPS=true&prodId=SPJ.SP19&searchId=R1&searchResultsType=SingleTab&searchType=BasicSearchForm&sort=DA-SORT&tabID=T004&userGroupName=palm83799".format(currentPosition)
        next_pages_response = s.get(paginate_url, headers=paginate_headers)

        ## A non-200 answer means we went past the last page. Stop BEFORE
        ## parsing: an error response is not a results page, so it is never
        ## handed to the parser and never has to be removed afterwards
        if next_pages_response.status_code != 200:
            break

        pagination_news = extractTitlesAndLinksFromPaginatePageResponse(next_pages_response, count)
        ## A page without results also ends the search; this keeps the loop
        ## from running forever if the server keeps answering 200
        if not pagination_news[count]["news"]:
            break

        pages.update(pagination_news)
        ## Updating our offset/position
        currentPosition += 20

    full_news["pages"] = pages
    return full_news
## This is the POST request to LOGIN_URL with our payload data and some extra headers to make sure everything works as expected
## The Referer is the login page itself, so LOGIN_URL is reused instead of repeating the literal
login_response = s.post(LOGIN_URL, data=payload, headers={'Referer':LOGIN_URL, 'Content-Type':'application/x-www-form-urlencoded'})
## Once we are logged in and our Session has all the website's cookies and sessions
## We call searchKeyWord() function with the text/keywords we want to search for
## Results will be stored in all_the_news var
all_the_news = searchKeyWord("byline john romano")
## Finally you can print the results
## (single-argument call form: prints the same on Python 2 and is valid syntax on Python 3)
print(all_the_news)
## Or do whatever you need to do. Like for example, loop all_the_news dictionary to make requests to every news url and scrape the data you are interested in.
## You can also adjust the script (add one more function) to scrape every news detail page data, and call it from inside of extractTitlesAndLinksFromPaginatePageResponse()
Вывод будет выглядеть примерно так (это лишь фрагмент, чтобы не вставлять слишком много данных):
{
'pages': {
1: {
'news': {
1: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=1&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA433496708&contentSet=GALE%7CA433496708',
'title': 'ANGER AT DECISIONS BUT APATHY AT POLLS.'
},
2: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=2&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA433399216&contentSet=GALE%7CA433399216',
'title': 'SMART GUN TECHNOLOGY STARTING TO MAKE SENSE.'
},
3: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=3&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA433029222&contentSet=GALE%7CA433029222',
'title': 'OF COURSE, FIRE S.C. DEPUTY, BUT MAYBE ...'
},
4: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=4&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA432820751&contentSet=GALE%7CA432820751',
'title': 'SCHOOL REFORMS MISS REAL PROBLEM.'
},
5: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=5&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA432699330&contentSet=GALE%7CA432699330',
'title': 'TENSION IS UNNECESSARILY THICK AT CITY HALL.'
},
6: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=6&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA432285591&contentSet=GALE%7CA432285591',
'title': 'OPT OUT MOVEMENT ON TESTING GETS NOTICE.'
},
7: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=7&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA432088310&contentSet=GALE%7CA432088310',
'title': 'CREDIT CITY COUNCIL FOR OPTIONS ON RAYS DEAL.'
},
8: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=8&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA431979679&contentSet=GALE%7CA431979679',
'title': 'FLORIDA CAN PLAY IT SMART ON MARIJUANA.'
},
9: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Article&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=9&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA432008411&contentSet=GALE%7CA432008411',
'title': 'A PLAY-BY-PLAY LOOK AT LIFE, THE RAYS.'
},
10: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=10&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA431632768&contentSet=GALE%7CA431632768',
'title': 'QUALITY LACKING AS FLORIDA ADDS JOBS.'
},
11: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=11&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA431451912&contentSet=GALE%7CA431451912',
'title': 'INSTEAD OF EMPATHY, JUDGE ADDS TO ABUSE.'
},
12: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=12&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA431359125&contentSet=GALE%7CA431359125',
'title': 'HE WANTS TO CONTROL HIS DEATH, HIS WAY.'
},
13: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=13&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA430976221&contentSet=GALE%7CA430976221',
'title': "POLITICAL PARTY'S RISE RAVAGED BY 'CRACKPOT'."
},
14: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=14&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA430813416&contentSet=GALE%7CA430813416',
'title': "STADIUM TALKS VS. HISTORY'S CURVEBALLS."
},
15: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=15&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA430729230&contentSet=GALE%7CA430729230',
'title': 'OVERHAUL BUSH-ERA EDUCATION REFORMS.'
},
16: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=16&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA430430295&contentSet=GALE%7CA430430295',
'title': 'BEWARE OF EXTRA FEES FOR CAR TAG RENEWALS.'
},
17: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=17&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA430179746&contentSet=GALE%7CA430179746',
'title': 'STATE FAILS SICK KIDS, THEN FIGHTS CHANGES.'
},
18: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=18&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA430104409&contentSet=GALE%7CA430104409',
'title': 'HOW A BIG CHANGED THE LIFE OF A LITTLE.'
},
19: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=19&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA429647686&contentSet=GALE%7CA429647686',
'title': 'PARK PLAN PUTS HEAT ON RAYS DECISION.'
},
20: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=20&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA429444602&contentSet=GALE%7CA429444602',
'title': 'SCOTT WILL TAKE CREDIT, BUT DODGES THE BURDEN.'
}
}
},
2: {
'news': {
1: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=21&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA428920357&contentSet=GALE%7CA428920357',
'title': 'HARD LINE ON POOR WORSE THAN OFFENSES.'
},
2: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=22&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA428643272&contentSet=GALE%7CA428643272',
'title': "DON'T RUN THE GRAND PRIX OUT OF TOWN."
},
3: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=23&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA428565070&contentSet=GALE%7CA428565070',
'title': "PUT JEB'S EDUCATION REFORMS TO THE TEST."
},
4: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=24&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA428196500&contentSet=GALE%7CA428196500',
'title': 'SINCERE APOLOGY IS A THING OF THE PAST.'
},
5: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=25&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA427980323&contentSet=GALE%7CA427980323',
'title': 'MISTRUST OF LEADERS DAMAGES EDUCATION.'
},
6: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=26&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA428127291&contentSet=GALE%7CA428127291',
'title': "ONLY ONE REMEDY FOR CLERK'S CONFLICT."
},
7: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=27&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA427578446&contentSet=GALE%7CA427578446',
'title': 'LOCAL POT LAWS COULD EASE RIGID PENALTIES.'
},
8: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=28&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA427324906&contentSet=GALE%7CA427324906',
'title': "UTILITIES' PLAN KEEPS CONSUMERS IN THE DARK."
},
9: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=29&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA427220594&contentSet=GALE%7CA427220594',
'title': 'JUDGE CONQUERS RETIREMENT WITH VERVE.'
},
10: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=30&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA426790479&contentSet=GALE%7CA426790479',
'title': 'APOLOGIES WOULD HELP IN SCHOOLS DISCUSSION.'
},
11: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=31&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA426560152&contentSet=GALE%7CA426560152',
'title': "PARENTS DON'T BACK BUSH'S TEST EMPHASIS."
},
12: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=32&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA426493640&contentSet=GALE%7CA426493640',
'title': 'POLITICALLY SPEAKING, THIS YEAR IS PATHETIC.'
},
13: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=33&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA426051781&contentSet=GALE%7CA426051781',
'title': "BLAMING PARENTS WON'T HELP CHILDREN."
},
14: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=34&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA425831366&contentSet=GALE%7CA425831366',
'title': "ON FAILING SCHOOLS, IT'S TIME FOR ACTION."
},
15: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=35&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA425724018&contentSet=GALE%7CA425724018',
'title': "SORRY? OUR LEGISLATORS DON'T KNOW THE WORD."
},
16: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=36&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA425256127&contentSet=GALE%7CA425256127',
'title': 'IN CLOSET, ESSENTIALS FOR MAKING LIVES BETTER.'
},
17: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=37&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA425006012&contentSet=GALE%7CA425006012',
'title': 'MEET IN MIDDLE ON TAXI, UBER REGULATION.'
},
18: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=38&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA424917550&contentSet=GALE%7CA424917550',
'title': "A STUNNING LOSS; The Tarpon Springs man who umpired the baseball game where a bat boy was killed is struggling to cope with the 9-year-old's death."
},
19: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=39&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA422480556&contentSet=GALE%7CA422480556',
'title': 'RAYS HAVE LOTS OF FANS, JUST NOT AT THE TROP.'
},
20: {
'link': 'http://go.galegroup.com/ps/retrieve.do?sort=DA-SORT&docType=Column&tabID=T004&prodId=SPJ.SP19&searchId=R1&resultListType=RESULT_LIST&searchType=BasicSearchForm&contentSegment=&currentPosition=40&searchResultsType=SingleTab&inPS=true&userGroupName=palm83799&docId=GALE%7CA422342622&contentSet=GALE%7CA422342622',
'title': 'TRY AGAIN WHEN IT COMES TO RECYCLING.'
}
}
},
},
'keyword': 'byline john romano'
}
И, наконец, как это было предложено в комментариях, вы можете:
- Пройти в цикле по словарю all_the_news, выполнить запрос по URL каждой новости и извлечь интересующие вас данные.
- Доработать скрипт (добавить ещё одну функцию), чтобы извлекать данные со страницы каждой новости, и вызывать её из extractTitlesAndLinksFromPaginatePageResponse().
Надеюсь, это поможет вам лучше понять, как работает библиотека requests.
Какие конкретные данные вам нужны из каждой статьи/новостей? –