I have a huge collection of points - and I want to determine the closest city to each point. How can I do this with BigQuery?
This is the best performing query we've worked out so far:
WITH a AS (
# a table with points around the world
SELECT * FROM UNNEST([ST_GEOGPOINT(-70, -33), ST_GEOGPOINT(-122,37), ST_GEOGPOINT(151,-33)]) my_point
), b AS (
# any table with cities world locations
SELECT *, ST_GEOGPOINT(lon,lat) latlon_geo
FROM `fh-bigquery.geocode.201806_geolite2_latlon_redux`
)
SELECT my_point, city_name, subdivision_1_name, country_name, continent_name
FROM (
SELECT loc.*, my_point
FROM (
SELECT ST_ASTEXT(my_point) my_point, ANY_VALUE(my_point) geop
, ARRAY_AGG( # get the closest city
STRUCT(city_name, subdivision_1_name, country_name, continent_name)
ORDER BY ST_DISTANCE(my_point, b.latlon_geo) LIMIT 1
)[SAFE_OFFSET(0)] loc
FROM a, b
WHERE ST_DWITHIN(my_point, b.latlon_geo, 100000) # filter to only close cities
GROUP BY my_point
)
)
GROUP BY 1,2,3,4,5
I have a huge collection of points ...
Felipe's solution is perfect in many ways but I found that in cases when you really have not just few points to search for closest city and you cannot limit yourself with 60 miles distance below solution works much better
#standardSQL
WITH a AS (
# a table with points around the world
SELECT ST_GEOGPOINT(lon,lat) my_point
FROM `fh-bigquery.geocode.201806_geolite2_latlon_redux`
), b AS (
# any table with cities world locations
SELECT *, ST_GEOGPOINT(lon,lat) latlon_geo, ST_ASTEXT(ST_GEOGPOINT(lon,lat)) hsh
FROM `fh-bigquery.geocode.201806_geolite2_latlon_redux`
)
SELECT AS VALUE
ARRAY_AGG(
STRUCT(my_point, city_name, subdivision_1_name, country_name, continent_name)
LIMIT 1
)[OFFSET(0)]
FROM (
SELECT my_point, ST_ASTEXT(closest) hsh
FROM a, (SELECT ST_UNION_AGG(latlon_geo) arr FROM b),
UNNEST([ST_CLOSESTPOINT(arr, my_point)]) closest
)
JOIN b
USING(hsh)
GROUP BY ST_ASTEXT(my_point)
Note:
not just few points ...
I am using same table as in in b
so there are 100K points to search closest city for and also no limits on how close or far lookup city can be (for this scenario - query in original answer will end up with famous Query exceeded resource limits
- while otherwise it shows much better if not best performance as it is truly stated in that answer)