Function for logic: Geocoding script for county selection

问题

I wrote following script to get the corresponding or best county match for a given input string of a city, like "New York, NY". The desired logic is commented inline. I tried my best and made the code reproducible. You can just change the corresponding dat and place input (2.2.1) to see the way it works.

# Load Packages
library(acs)
library(tidyverse)
library(tigris)
data(fips_codes)

# 1. Select Data

dat = geo.lookup(state = "NY", place = "New York")
#dat = geo.lookup(state = "TX", place = "Dallas")
#dat = geo.lookup(state = "OR", place = "Portland")
#dat = geo.lookup(state = "NY", place = "Manhattan")
#dat = geo.lookup(state = "NY", place = "Queens")
print(dat)

dat = na.omit(dat) # remove first row that is only contains state information and NA

# 2. Check whether county.name has multiple counties, separated by comma
cvals <- dat %>% filter(str_detect(county.name, ","), row_number() == 2L)
nrow(cvals)

# 2.1 If nrow(cvals) = 0, take first row
dat[1,]

# 2.2 If nrow(cvals) > 0, do split string and unnest
unbundle <- dat %>% 
  group_by(state.name, place.name) %>% 
  mutate(county.name = strsplit(county.name, ", ")) %>% 
  unnest %>%
  na.omit()
unbundle

# 2.2.1 If "place =" input matches a county.name in unbundle, select that row
check <- unbundle %>% filter(str_detect(county.name, "New York"))
nrow(check)
#select that row
select <- unbundle %>% filter(str_detect(county.name, "New York"))

# 2.2.2 Otherwise, if there is no match, i.e. nrow(select) = 0, take first row from unbundle by default
unbundle[1,]

# 3.1 Merge countyfips from fips_codes into selected table (For New York, the final output of 2.2.1 would have been selected)
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")
select %>% left_join(fips_codes, by = c("state.name", "county.name"))

I wonder how this function can be written, so that an input, like "Portland, OR", "Queens, NY", or anything above would work. Maybe, there's also a smarter way to write the whole script. As I'm learning dplyr, dlplyr solutions are preferred.

Thanks!

Solution (updated):

library(acs)
library(tidyverse)
library(tigris)
data(fips_codes)
colnames(fips_codes) = c("state.abb", "statefips", "state.name", "countyfips", "county.name")

    FUN <- function(x) {
      Place <- strsplit(x, ", ")[[1]][1]
      State <- strsplit(x, ", ")[[1]][2]
      dat = geo.lookup(state = State, place = Place)
      dat = na.omit(dat)

      # 1 Check whether county.name has multiple counties
      cvals <- dat %>% filter(str_detect(county.name, ","))

      # 2 If not, i.e. cvals == 0, take first row of output
      if(nrow(cvals[2,]) == 0) {
        output <- dat[1,]
      }

      # 3 If yes, i.e. cvals > 0, unbundle code and proceed
      else {
        unbundle <- dat %>% 
          group_by(state.name, place.name) %>% 
          mutate(county.name = strsplit(county.name, ", ")) %>% 
          unnest %>%
          na.omit()

        # 3.1 If "Place" matches one of county.name values, take that row
        check <- unbundle %>% filter(str_detect(county.name, Place))
        nrow(check)

        if (nrow(check) > 0) {
        output <- check[1,]
      } 
        # 3.2 Otherwise, if no match, nrow(check) = 0, take first row from unbundle by default
        output <- unbundle[1,]
      }
    # Join county data with fips code table
      output <- output %>% left_join(fips_codes, by = c("state.name", "county.name"))
      print(output)
    }

    FUN("New York, NY")
    FUN("Portland, OR")
    FUN("Manhattan, NY")
    FUN("Cambridge, MA")

回答1:

Your Q is not really clear, but just use the acs data frame directly:

library(acs)
library(tidyverse)

place_to_county <- function(place, state = NULL) {

  if (is.null(state)) {

    x <- trimws(strsplit(place, ",", 2)[[1]])

    place <- x[1]
    state <- x[2]

  }

  tbl_df(acs::fips.place) %>%
    filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) %>%
    separate_rows(COUNTY, sep=", ") %>%
    head(1) %>%
    setNames(tolower(colnames(.)))

}

Some examples:

place_to_county("New York", "NY")
## # A tibble: 1 x 7
##   state statefp placefp     placename               type funcstat       county
##   <chr>   <int>   <int>         <chr>              <chr>    <chr>        <chr>
## 1    NY      36   51000 New York city Incorporated Place        A Bronx County

place_to_county("New York, NY")
## # A tibble: 1 x 7
##   state statefp placefp     placename               type funcstat       county
##   <chr>   <int>   <int>         <chr>              <chr>    <chr>        <chr>
## 1    NY      36   51000 New York city Incorporated Place        A Bronx County

place_to_county("Queens", "NY")
## # A tibble: 1 x 7
##   state statefp placefp      placename               type funcstat        county
##   <chr>   <int>   <int>          <chr>              <chr>    <chr>         <chr>
## 1    NY      36   60323 Queens borough County Subdivision        G Queens County

place_to_county("Queens, NY")
## # A tibble: 1 x 7
##   state statefp placefp      placename               type funcstat        county
##   <chr>   <int>   <int>          <chr>              <chr>    <chr>         <chr>
## 1    NY      36   60323 Queens borough County Subdivision        G Queens County

place_to_county("Berwick", "ME")
## # A tibble: 1 x 7
##   state statefp placefp   placename                    type funcstat      county
##   <chr>   <int>   <int>       <chr>                   <chr>    <chr>       <chr>
## 1    ME      23    4685 Berwick CDP Census Designated Place        S York County

place_to_county("Berwick, ME")
## # A tibble: 1 x 7
##   state statefp placefp   placename                    type funcstat      county
##   <chr>   <int>   <int>       <chr>                   <chr>    <chr>       <chr>
## 1    ME      23    4685 Berwick CDP Census Designated Place        S York County

place_to_county("Manhattan", "NY")
## # A tibble: 1 x 7
##   state statefp placefp         placename               type funcstat          county
##   <chr>   <int>   <int>             <chr>              <chr>    <chr>           <chr>
## 1    NY      36   44919 Manhattan borough County Subdivision        G New York County

place_to_county("Manhattan, NY")
## # A tibble: 1 x 7
##   state statefp placefp         placename               type funcstat          county
##   <chr>   <int>   <int>             <chr>              <chr>    <chr>           <chr>
## 1    NY      36   44919 Manhattan borough County Subdivision        G New York County

As you can see, that works if the parameters are specified separately or as an "x, y" string.

This version is a bit more robust:

place_to_county <- function(place, state = NULL) {

  if (is.null(state)) {

    x <- trimws(strsplit(place, ",", 2)[[1]])

    place <- x[1]
    state <- x[2]

  }

  tbl_df(acs::fips.place) %>% 
    filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) -> xdf

  if (nrow(xdf) > 0) {
    separate_rows(xdf, COUNTY, sep=", ") %>%
    head(1) %>%
    setNames(tolower(colnames(.)))
  } else {
    NULL
  }

}

as it gracefully handles complete misses.

UPDATE to address the comment (I'll use the simplified version of the code):

tbl_df(acs::fips.place) %>%
  filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) %>%
  separate_rows(COUNTY, sep=", ") %>%
  head(1) %>%
  setNames(tolower(colnames(.)))

Whereas:

%>% the pipe symbol seen in tidyverse/dplyr code. It (oversimplifying) avoids using temporary variables assignments
tbl_df(…) (again, oversimplifying) just ensures the output is more human-readable (it tags the data frame with additional classes)
filter(…) does the work you want. the grepl() case-insensitive searches for the place and then takes those matches and further refines by state. This is a "dumb" way to do it since it catches the place name anywhere in the string. There are smarter ways, but this should work pretty well.
separate_rows(…) will take the entries in fips.place that have more than one county in them and make a separate row for each county.
head(1) naively takes the first match
setNames(…) makes lower-case column names as the ones in fips.place are (ugh) all UPPER_CASE

PENULTIMATE UPDATE

This merges the tigris place_name data:

place_to_county <- function(place, state = NULL) {

  if (is.null(state)) {

    x <- trimws(strsplit(place, ",", 2)[[1]])

    place <- x[1]
    state <- x[2]

  }

  tbl_df(acs::fips.place) %>%
    filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) -> xdf

  if (nrow(xdf) > 0) {
    separate_rows(xdf, COUNTY, sep=", ") %>%
      head(1) %>%
      setNames(tolower(colnames(.))) %>% 
      left_join(tigris::fips_codes, by=c("county", "state"))
  } else {
    NULL
  }

}

FINAL UPDATE (for real):

This was a pretty simple addition. I'm not going to refine this further. You won't get better at coding in R w/o some experimentation on your own.

place_to_county <- function(place, state = NULL) {

  if (is.null(state)) {

    x <- trimws(strsplit(place, ",", 2)[[1]])

    place <- x[1]
    state <- x[2]

  }

  tbl_df(acs::fips.place) %>%
    filter(grepl(place, PLACENAME, ignore.case=TRUE) & STATE == state) -> xdf

  if (nrow(xdf) > 0) {
    separate_rows(xdf, COUNTY, sep=", ") %>%
      head(1) %>%
      setNames(tolower(colnames(.))) %>% 
      left_join(tigris::fips_codes, by=c("county", "state"))
  } else {
    data_frame(
      state = state, 
      statefp = NA, 
      placefp = NA, 
      placename = place, 
      type = NA, 
      funcstat = NA, 
      county = NA, 
      state_code = NA, 
      state_name = NA,
      county_code = NA
    )
  }

}

Any other features/changes are up to you.

来源：https://stackoverflow.com/questions/47569996/function-for-logic-geocoding-script-for-county-selection

标签

function

dplyr

geocoding