Я пишу веб-скрапер на R, чтобы собирать информацию о налоговых службах. Пока у меня есть приведённый ниже код, который отлично работает для Аргентины. Однако он требует, чтобы каждый URL вводился вручную. На сайте нет ни файла Sitemap, ни файла robots.txt; есть ли другой способ автоматически находить нужные URL-адреса? Заранее спасибо. Универсальный веб-скрапер на R
library(RCurl)
library(XML)
library(rvest)
# SETUP ----
# XLConnect provides writeWorksheetToFile(); it is loaded once here and used
# by all three export steps below.
library(XLConnect)

# Workbook that receives one sheet per section of this script.
out_file <- "z:/My Documents/Impendulo/Argentina.xlsx"

# Extract the lines of one section of a page.
#
# A section starts at the first line matching `heading` and runs up to (but
# not including) the next line matching `marker`. If no marker line follows,
# the section runs to the end of `lines`.
#
# Arguments:
#   lines    Character vector, one element per line of page source.
#   heading  Regular expression identifying the section's first line.
#   marker   Regular expression identifying lines that start any section.
#
# Returns a one-column character matrix holding the section's lines.
# Stops with an error when no line matches `heading`; warns and uses the
# first match when several lines match.
#
# Line numbers are compared numerically. Passing a line number to grepl() as
# a pattern would treat it as a regex, so 12 would also match 112 or 120.
extract_section <- function(lines, heading, marker) {
  starts <- grep(heading, lines)
  if (length(starts) == 0L) {
    stop("No line matches heading pattern: ", heading, call. = FALSE)
  }
  if (length(starts) > 1L) {
    warning(
      "Heading pattern matches ", length(starts),
      " lines; using the first: ", heading,
      call. = FALSE
    )
  }
  first <- starts[1L]

  boundaries <- grep(marker, lines)
  following <- boundaries[boundaries > first]
  last <- if (length(following) > 0L) following[1L] - 1L else length(lines)

  as.matrix(lines[first:last])
}

# INFORMATION ----
# Look up the Argentina link(s) on the index page of tax authorities.
index_url <- "http://oceantax.co.uk/links/tax-authorities-worldwide.html"
# read_html() replaces html(), which was removed from rvest.
pg <- read_html(index_url)
# Anchors whose title attribute contains the word "Argentina".
country <- pg %>% html_nodes("a[title~=Argentina]")
name <- as.matrix(country %>% html_text())
url <- as.matrix(country %>% html_attr("href"))

# Contact page, read as raw lines; sections are delimited by <strong> tags.
arginfo <- readLines("http://www.afip.gob.ar/contacto/")

# Telephone
phone <- extract_section(
  arginfo,
  "<strong> Centro de Información Telefónica</strong>",
  "<strong>"
)

# Email
email <- extract_section(arginfo, "<strong>Tema:", "<strong>")

# Stack everything into a single one-column matrix for export.
info <- rbind(name, url, phone, email)
writeWorksheetToFile(file = out_file, data = info, sheet = "Information")

# TAX DETAILS ----
argtax <- readLines(
  "http://www.afip.gob.ar/futCont/otros/sistemaTributarioArgentino"
)

# Text under the heading "Operaciones de Seguro." up to the next <strong>.
# NOTE(review): the "." is an unescaped regex wildcard, so this matches any
# single character before </strong> -- confirm that is intended.
data1 <- extract_section(
  argtax,
  "<strong>Operaciones de Seguro.</strong>",
  "<strong>"
)

# Same procedure for the heading without the trailing character.
data2 <- extract_section(
  argtax,
  "<strong>Operaciones de Seguro</strong>",
  "<strong>"
)

# Combine both extracts so they are exported to Excel in one piece.
data <- rbind(data1, data2)
writeWorksheetToFile(file = out_file, data = data, sheet = "Tax Details")

# PAYMENT DETAILS ----
# Same page as the tax details; reuse the lines already downloaded instead of
# fetching the URL a second time.
argpaym <- argtax

# Determination and Collection of Taxes
dat1 <- extract_section(
  argpaym,
  '"contenidoSubTitle">Determinación y Percepción de Impuestos',
  "contenidoSubTitle"
)

# Interest, Legal and Penalties
dat2 <- extract_section(
  argpaym,
  '"contenidoSubTitle">Interés, ilícitos y sanciones',
  "contenidoSubTitle"
)

# Prescription
dat3 <- extract_section(
  argpaym,
  '"contenidoSubTitle">Prescripción',
  "contenidoSubTitle"
)

dat <- rbind(dat1, dat2, dat3)
writeWorksheetToFile(file = out_file, data = dat, sheet = "Payment Details")