Я получаю индекс индекса вне диапазона, и не знаю, почему. Мой код - это webscraper для сбора данных о температуре с веб-сайта. Все работало отлично в течение нескольких месяцев, до недавнего времени.Вне диапазона BeautifulSoup
У меня есть ряд функций, показанных ниже в качестве ссылки. Важным является getDailyAve()
, в котором я получаю исключение.
Любые мысли или советы приветствуются.
import sys
import urllib
from bs4 import BeautifulSoup
from urllib2 import urlopen, URLError
import webbrowser
import time
from collections import Counter
import numpy as np
import re
import csv
import datetime
from datetime import timedelta
DATE_FORMAT = '%Y/%m/%d'
def daterange(start, end):
def convert(date):
try:
date = datetime.datetime.strptime(date, DATE_FORMAT)
return date.date()
except TypeError:
return date
def get_date(n):
return datetime.datetime.strftime(convert(start) + timedelta(days=n), DATE_FORMAT)
days = (convert(end) - convert(start)).days
if days <= 0:
raise ValueError('The start date must be before the end date.')
for n in range(0, days):
yield get_date(n)
class SiteLocation:
"""class defining mine location parameters to lookup on weather search"""
def __init__(self, city, state, zip, code):
self.city = city
self.state = state
self.zip = zip
self.code = code
def getDailyAve(url):
url = urllib.urlopen(url)
soup = BeautifulSoup(url.read(), 'lxml')
form = soup.find("form",{"id": "archivedate"})
table = form.find_next_sibling("table")
rows = table.select("tr")[1:]
time=[]
temp=[]
minutes=[]
# handle no data case
if soup.find(text="Archive data not available for this date."):
print("Data not available, URL: '%s'" % url)
return None
# capture time and temps
for row in rows:
data = [td.text for td in row.find_all("td")]
match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])",data[2])
if match:
temp.append(match.group())
time.append(data[0])
minutes.append(data[0][-4:-2])
common = Counter(minutes).most_common()[0][0]
finalTimes = []
finalTemps = []
for i in range(0,len(time)):
if minutes[i] == common:
finalTimes.append(time[i])
finalTemps.append(int(temp[i]))
dailyAve = sum(finalTemps)/float(len(finalTimes))
return dailyAve
def writeToCsv(list1, list2, list3, list4, list5, list6, list7, list8):
with open('results.csv', 'wb') as csvfile:
results = csv.writer(csvfile, delimiter=',')
results.writerow(['T-SJ', 'T- RB', 'T-DS', 'T-JW', 'T-GB', 'D', 'M', 'Y'])
for idx in range(0,len(list1)):
results.writerow([str(list1[idx]), str(list2[idx]), str(list3[idx]), str(list4[idx]), str(list5[idx]), str(list6[idx]), str(list7[idx]), str(list8[idx])])
def buildURL(location, day, month, year):
if day < 10:
strDay = '0'+str(day)
else:
strDay = str(day)
baseURL = "http://www.weatherforyou.com/reports/index.php?forecast=pass&pass=archive&zipcode=" + location.zip + "&pands=" + location.city + "%2" + "C" + location.state + "&place=" + location.city + "&state=" + location.state + "&icao=" + location.code + "&country=us&month=" + str(month) + "&day=" + strDay + "&year=" + str(year) + "&dosubmit=Go"
return baseURL
def main():
loc1 = SiteLocation('Farmington','NM','87401','KFMN')
loc2 = SiteLocation('Whitesville','WV','25209','KBKW')
loc3 = SiteLocation('Rangely','CO','81648','KVEL')
loc4 = SiteLocation('Brookwood','AL','35444','KTCL')
loc5 = SiteLocation('Princeton','IN','47670','KAJG')
start = '2016/08/31'
end = datetime.date.today()
dateRange = list(daterange(start, end))
listDailyAve1 = []
listDailyAve2 = []
listDailyAve3 = []
listDailyAve4 = []
listDailyAve5 = []
listDays = []
listMonths = []
listYears = []
for idx in range(0,len(dateRange)):
strDate = str(dateRange[idx]).split("/")
year = strDate[0]
month = strDate[1]
day = strDate[2]
url1 = buildURL(loc1, day, month, year)
url2 = buildURL(loc2, day, month, year)
url3 = buildURL(loc3, day, month, year)
url4 = buildURL(loc4, day, month, year)
url5 = buildURL(loc5, day, month, year)
dailyAve1 = getDailyAve(url1)
dailyAve2 = getDailyAve(url2)
dailyAve3 = getDailyAve(url3)
dailyAve4 = getDailyAve(url4)
dailyAve5 = getDailyAve(url5)
listDailyAve1.append(dailyAve1)
listDailyAve2.append(dailyAve2)
listDailyAve3.append(dailyAve3)
listDailyAve4.append(dailyAve4)
listDailyAve5.append(dailyAve5)
listDays.append(day)
listMonths.append(month)
listYears.append(year)
writeToCsv(listDailyAve1, listDailyAve2, listDailyAve3, listDailyAve4,listDailyAve5, listDays, listMonths, listYears)
if __name__ == '__main__':
status = main()
sys.exit(status)
Вот исключение брошено:
Traceback (most recent call last):
File ".\weatherScrape2.py", line 147, in <module>
status = main()
File ".\weatherScrape2.py", line 128, in main
dailyAve1 = getDailyAve(url1)
File ".\weatherScrape2.py", line 61, in getDailyAve
match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])",data[2])
IndexError: list index out of range
Возможно ли температура упала ниже нуля? В Бельгии они сделали :) –
Не могли бы вы предоставить stacktrace, где произошла ошибка? Кроме того, ваш код скорее подвержен ошибкам: это не очень хороший дизайн (извините). –
@WillemVanOnsem Мой код должен учитываться ниже нуля, но да, они бы упали ниже. –