Вот способ получить все поля вместе с атрибутами с помощью xml2.
Он также использует fromJSON для разбора встроенного поля адреса:
оно содержит JSON, и регулярные выражения могут не сработать на других файлах данных, если в этом поле появятся какие-нибудь нестандартные фрагменты.
library(xml2)
library(dplyr)
library(jsonlite)
library(pbapply) # some ops take a while & progress bars (like fezzes) are cool
# read the XML file into an xml2 document
doc <- read_xml("getdata-data-restaurants.xml")
# select the nested <row> elements under <response>/<row> via an absolute XPath
rows <- doc %>% xml_find_all("//response/row/row")
# extract "easy" fields
# xml_find_first() yields exactly one result per <row> (a missing child becomes
# NA after xml_text()), so every column has one entry per row and stays aligned
# with the other per-row tables. xml_find_all() would silently drop rows that
# lack the element, shifting or shortening that column.
fields <- local({
field_names <- c("name", "zipcode", "neighborhood",
"councildistrict", "policedistrict")
field_cols <- pblapply(field_names,
function(x) xml_text(xml_find_first(rows, x)))
names(field_cols) <- field_names
# a named list (instead of sapply's matrix) keeps one column per field even
# when there is only a single row
data.frame(field_cols, stringsAsFactors=FALSE)
})
# alternate method (paths are relative to each row, not "//..." from the root)
# fields <- data_frame(name=xml_text(xml_find_first(rows, "name")),
# zipcode=xml_text(xml_find_first(rows, "zipcode")),
# neighborhood=xml_text(xml_find_first(rows, "neighborhood")),
# councildistrict=xml_text(xml_find_first(rows, "councildistrict")),
# policedistrict=xml_text(xml_find_first(rows, "policedistrict")))
# collect the attributes of every <row>: each attribute vector is turned into
# a list so bind_rows() can stack them into a single data frame
row_attrs <- rows %>%
xml_attrs() %>%
lapply(as.list) %>%
bind_rows()
# extract the attributes from <location_1>
# The lookup is relative to each row ("location_1", not "//location_1"): an
# absolute path is evaluated from the document root for every row, which is
# wasteful and not tied to the row it was asked for. xml_find_first() also
# guarantees one entry per row, so the result stays aligned with `rows`.
loc_attrs <- pblapply(xml_attrs(xml_find_first(rows, "location_1")), as.list)
# since each one is a JSON encoded string, use fromJSON to convert
# and then extract them all into a data frame
human_address <- bind_rows(pblapply(loc_attrs, function(x) {
# exact name lookup; the previous `x$human_addres` only worked through
# `$` partial matching on lists
json <- x[["human_address"]]
if (is.null(json) || is.na(json)) {
# no location/address for this row: emit an all-NA placeholder row so the
# table keeps one row per <row> (column names as seen in the parsed JSON)
return(data.frame(address=NA_character_, city=NA_character_,
state=NA_character_, zip=NA_character_,
stringsAsFactors=FALSE))
}
data.frame(fromJSON(json), stringsAsFactors=FALSE)
}))
# combine the three per-row tables side by side: row attributes first,
# then the parsed address parts, then the plain text fields
dat <- row_attrs %>% bind_cols(human_address, fields)
# print a compact overview of the columns
glimpse(dat)
## Observations: 1,327
## Variables: 13
## $ _id (chr) "1", "2", "3", "4", "5", "6", "7", "8", "...
## $ _uuid (chr) "93CACF6F-C8C2-4B87-95A8-8177806D5A6F", "...
## $ _position (chr) "1", "2", "3", "4", "5", "6", "7", "8", "...
## $ _address (chr) "http://data.baltimorecity.gov/resource/k...
## $ address (chr) "4509 BELAIR ROAD", "1919 FLEET ST", "284...
## $ city (chr) "Baltimore", "Baltimore", "Baltimore", "B...
## $ state (chr) "MD", "MD", "MD", "MD", "MD", "MD", "MD",...
## $ zip (chr) "", "", "", "", "", "", "", "", "", "", "...
## $ name (chr) "410", "1919", "SAUTE", "#1 CHINESE KITCH...
## $ zipcode (chr) "21206", "21231", "21224", "21211", "2122...
## $ neighborhood (chr) "Frankford", "Fells Point", "Canton", "Ha...
## $ councildistrict (chr) "2", "1", "1", "14", "9", "14", "13", "7"...
## $ policedistrict (chr) "NORTHEASTERN", "SOUTHEASTERN", "SOUTHEAS...
# filter in R with dplyr: keep the rows whose zipcode is "21231" and show
# the name together with the parsed address columns
select(
filter(dat, zipcode=="21231"),
name, address, city, state, zip
)
## Source: local data frame [127 x 5]
##
## name address city state zip
## (chr) (chr) (chr) (chr) (chr)
## 1 1919 1919 FLEET ST Baltimore MD
## 2 300 SOUTH ANN STREET 300 ANN ST Baltimore MD
## 3 ADMIRAL FELL INN 818 BROADWAY Baltimore MD
## 4 ALE MARY'S 1939 FLEET ST Baltimore MD
## 5 ALEXANDER'S TAVERN 710 BROADWAY Baltimore MD
## 6 BERTHA'S RESTAURANT 734 BROADWAY Baltimore MD
## 7 BIRDS OF A FEATHER 1712 ALICEANNA ST Baltimore MD
## 8 BLARNEY STONE PUB 704 BROADWAY Baltimore MD
## 9 ARCOS 129 BROADWAY Baltimore MD
## 10 ARIZONA BAR & GRILL 25 BROADWAY Baltimore MD
## .. ... ... ... ... ...
Я не стал возиться с needs_recoding,
так как в этом случае он всегда был равен true,
но при необходимости его легко добавить.