问题
I wrote following script to get the corresponding or best county match for a given input string of a city, like "New York, NY". The desired logic is commented inline. I tried my best and made the code reproducible. You can just change the corresponding dat
and place
input (2.2.1) to see the way it works.
# Load Packages
library(acs)
library(tidyverse)
library(tigris)
data(fips_codes)
# 1. Select Data
dat = geo.lookup(state = "NY", place = "New York")
#dat = geo.lookup(state = "TX", place = "Dallas")
#dat = geo.lookup(state = "OR", place = "Portland")
#dat = geo.lookup(state = "NY", place = "Manhattan")
#dat = geo.lookup(state = "NY", place = "Queens")
print(dat)
dat = na.omit(dat) # remove first row that is only contains state information and NA
# 2. Check whether county.name has multiple counties, separated by comma
cvals <- dat %>% filter(str_detect(county.name, ","), row_number() == 2L)
nrow(cvals)
# 2.1 If nrow(cvals) = 0, take first row
dat[1,]
# 2.2 If nrow(cvals) > 0, do split string and unnest
unbundle <- dat %>%
group_by(state.name, place.name) %>%
mutate(county.name = strsplit(county.name, ", ")) %>%
unnest %>%
na.omit()
unbundle
# 2.2.1 If "place =" input matches a county.name in unbundle, select that row
check <- unbundle %>% filter(str_detect(county.name, "New York"))
nrow(check)
#select that row
select <- unbundle %>% filter(str_detect(county.name, "New York"))
# 2.2.2 Otherwise, if there is no match, i.e. nrow(select) = 0, take first row from unbundle by default
unbundle[1,]
# 3.1 Merge countyfips from fips_codes into selected table (For New York, the final output of 2.2.1 would have been selected)
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")
select %>% left_join(fips_codes, by = c("state.name", "county.name"))
I wonder how this function can be written, so that an input, like "Portland, OR", "Queens, NY", or anything above would work. Maybe, there's also a smarter way to write the whole script. As I'm learning dplyr, dlplyr solutions are preferred.
Thanks!
Solution (updated):
library(acs)
library(tidyverse)
library(tigris)
data(fips_codes)
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")
FUN <- function(x) {
Place <- strsplit(x, ", ")[[1]][1]
State <- strsplit(x, ", ")[[1]][2]
dat = geo.lookup(state = State, place = Place)
dat = na.omit(dat)
# 1 Check whether county.name has multiple counties
cvals <- dat %>% filter(str_detect(county.name, ","))
# 2 If not, i.e. cvals == 0, take first row of output
if(nrow(cvals[2,]) == 0) {
output <- dat[1,]
}
# 3 If yes, i.e. cvals > 0, unbundle code and proceed
else {
unbundle <- dat %>%
group_by(state.name, place.name) %>%
mutate(county.name = strsplit(county.name, ", ")) %>%
unnest %>%
na.omit()
# 3.1 If "Place" matches one of county.name values, take that row
check <- unbundle %>% filter(str_detect(county.name, Place))
nrow(check)
if (nrow(check) > 0) {
output <- check[1,]
}
# 3.2 Otherwise, if no match, nrow(check) = 0, take first row from unbundle by default
output <- unbundle[1,]
}
# Join county data with fips code table
output <- output %>% left_join(fips_codes, by = c("state.name", "county.name"))
print(output)
}
FUN("New York, NY")
FUN("Portland, OR")
FUN("Manhattan, NY")
FUN("Cambridge, MA")
回答1:
Your Q is not really clear, but just use the acs
data frame directly:
library(acs)
library(tidyverse)
place_to_county <- function(place, state = NULL) {
if (is.null(state)) {
x <- trimws(strsplit(place, ",", 2)[[1]])
place <- x[1]
state <- x[2]
}
tbl_df(acs::fips.place) %>%
filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) %>%
separate_rows(COUNTY, sep=", ") %>%
head(1) %>%
setNames(tolower(colnames(.)))
}
Some examples:
place_to_county("New York", "NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 51000 New York city Incorporated Place A Bronx County
place_to_county("New York, NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 51000 New York city Incorporated Place A Bronx County
place_to_county("Queens", "NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 60323 Queens borough County Subdivision G Queens County
place_to_county("Queens, NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 60323 Queens borough County Subdivision G Queens County
place_to_county("Berwick", "ME")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 ME 23 4685 Berwick CDP Census Designated Place S York County
place_to_county("Berwick, ME")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 ME 23 4685 Berwick CDP Census Designated Place S York County
place_to_county("Manhattan", "NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 44919 Manhattan borough County Subdivision G New York County
place_to_county("Manhattan, NY")
## # A tibble: 1 x 7
## state statefp placefp placename type funcstat county
## <chr> <int> <int> <chr> <chr> <chr> <chr>
## 1 NY 36 44919 Manhattan borough County Subdivision G New York County
As you can see, that works if the parameters are specified separately or as an "x, y" string.
This version is a bit more robust:
place_to_county <- function(place, state = NULL) {
if (is.null(state)) {
x <- trimws(strsplit(place, ",", 2)[[1]])
place <- x[1]
state <- x[2]
}
tbl_df(acs::fips.place) %>%
filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) -> xdf
if (nrow(xdf) > 0) {
separate_rows(xdf, COUNTY, sep=", ") %>%
head(1) %>%
setNames(tolower(colnames(.)))
} else {
NULL
}
}
as it gracefully handles complete misses.
UPDATE to address the comment (I'll use the simplified version of the code):
tbl_df(acs::fips.place) %>%
filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) %>%
separate_rows(COUNTY, sep=", ") %>%
head(1) %>%
setNames(tolower(colnames(.)))
Whereas:
%>%
the pipe symbol seen in tidyverse/dplyr code. It (oversimplifying) avoids using temporary variables assignmentstbl_df(…)
(again, oversimplifying) just ensures the output is more human-readable (it tags the data frame with additional classes)filter(…)
does the work you want. thegrepl()
case-insensitive searches for the place and then takes those matches and further refines by state. This is a "dumb" way to do it since it catches the place name anywhere in the string. There are smarter ways, but this should work pretty well.separate_rows(…)
will take the entries infips.place
that have more than one county in them and make a separate row for each county.head(1)
naively takes the first matchsetNames(…)
makes lower-case column names as the ones infips.place
are (ugh) all UPPER_CASE
PENULTIMATE UPDATE
This merges the tigris
place_name
data:
place_to_county <- function(place, state = NULL) {
if (is.null(state)) {
x <- trimws(strsplit(place, ",", 2)[[1]])
place <- x[1]
state <- x[2]
}
tbl_df(acs::fips.place) %>%
filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) -> xdf
if (nrow(xdf) > 0) {
separate_rows(xdf, COUNTY, sep=", ") %>%
head(1) %>%
setNames(tolower(colnames(.))) %>%
left_join(tigris::fips_codes, by=c("county", "state"))
} else {
NULL
}
}
FINAL UPDATE (for real):
This was a pretty simple addition. I'm not going to refine this further. You won't get better at coding in R w/o some experimentation on your own.
place_to_county <- function(place, state = NULL) {
if (is.null(state)) {
x <- trimws(strsplit(place, ",", 2)[[1]])
place <- x[1]
state <- x[2]
}
tbl_df(acs::fips.place) %>%
filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) -> xdf
if (nrow(xdf) > 0) {
separate_rows(xdf, COUNTY, sep=", ") %>%
head(1) %>%
setNames(tolower(colnames(.))) %>%
left_join(tigris::fips_codes, by=c("county", "state"))
} else {
data_frame(
state = state,
statefp = NA,
placefp = NA,
placename = place,
type = NA,
funcstat = NA,
county = NA,
state_code = NA,
state_name = NA,
county_code = NA
)
}
}
Any other features/changes are up to you.
来源:https://stackoverflow.com/questions/47569996/function-for-logic-geocoding-script-for-county-selection