I have an input of 36,742 points which means if I wanted to calculate the lower triangle of a distance matrix (using the vincenty approximation) I would need to generate 36,742*
Thanks everyone's help. I think I have solved this by incorporating all the suggestions.
I use numpy to import the geographic co-ordinates and then project them using "France Lambert - 93". This lets me fill scipy.spatial.cKDTree with the points and then calculate a sparse_distance_matrix by specifying a cut-off of 50km (my projected points are in metres). I then extract extract the lower-triangle to a CSV.
import numpy as np
import csv
import time
from pyproj import Proj, transform
#http://epsg.io/2154 (accuracy: 1.0m)
fr = '+proj=lcc +lat_1=49 +lat_2=44 +lat_0=46.5 +lon_0=3 \
+x_0=700000 +y_0=6600000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 \
+units=m +no_defs'
#http://epsg.io/27700-5339 (accuracy: 1.0m)
uk = '+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 \
+x_0=400000 +y_0=-100000 +ellps=airy \
+towgs84=446.448,-125.157,542.06,0.15,0.247,0.842,-20.489 +units=m +no_defs'
path_to_csv = '.../raw_in.csv'
out_csv = '.../out.csv'
def proj_arr(points):
inproj = Proj(init='epsg:4326')
outproj = Proj(uk)
# origin|destination|lon|lat
func = lambda x: transform(inproj,outproj,x[2],x[1])
return np.array(list(map(func, points)))
tstart = time.time()
# Import points as geographic coordinates
# ID|lat|lon
#Sample to try and replicate
#points = np.array([
# [39007,46.585012,5.5857829],
# [88086,48.192370,6.7296289],
# [62627,50.309155,3.0218611],
# [14020,49.133972,-0.15851507],
# [1091, 42.981765,2.0104902]])
#
points = np.genfromtxt(path_to_csv,
delimiter=',',
skip_header=1)
print("Total points: %d" % len(points))
print("Triangular matrix contains: %d" % (len(points)*((len(points))-1)*0.5))
# Get projected co-ordinates
proj_pnts = proj_arr(points)
# Fill quad-tree
from scipy.spatial import cKDTree
tree = cKDTree(proj_pnts)
cut_off_metres = 1600
tree_dist = tree.sparse_distance_matrix(tree,
max_distance=cut_off_metres,
p=2)
# Extract triangle
from scipy import sparse
udist = sparse.tril(tree_dist, k=-1) # zero the main diagonal
print("Distances after quad-tree cut-off: %d " % len(udist.data))
# Export CSV
import csv
f = open(out_csv, 'w', newline='')
w = csv.writer(f, delimiter=",", )
w.writerow(['id_a','lat_a','lon_a','id_b','lat_b','lon_b','metres'])
w.writerows(np.column_stack((points[udist.row ],
points[udist.col],
udist.data)))
f.close()
"""
Get ID labels
"""
id_to_csv = '...id.csv'
id_labels = np.genfromtxt(id_to_csv,
delimiter=',',
skip_header=1,
dtype='U')
"""
Try vincenty on the un-projected co-ordinates
"""
from geopy.distance import vincenty
vout_csv = '.../out_vin.csv'
test_vin = np.column_stack((points[udist.row].T[1:3].T,
points[udist.col].T[1:3].T))
func = lambda x: vincenty(x[0:2],x[2:4]).m
output = list(map(func,test_vin))
# Export CSV
f = open(vout_csv, 'w', newline='')
w = csv.writer(f, delimiter=",", )
w.writerow(['id_a','id_a2', 'lat_a','lon_a',
'id_b','id_b2', 'lat_b','lon_b',
'proj_metres','vincenty_metres'])
w.writerows(np.column_stack((list(id_labels[udist.row]),
points[udist.row ],
list(id_labels[udist.col]),
points[udist.col],
udist.data,
output,
)))
f.close()
print("Finished in %.0f seconds" % (time.time()-tstart)
This approach took 164 seconds to generate (for 5,306,434 distances) - compared to 9 - and also around 90 seconds to save to disk.
I then compared the difference in the vincenty distance and the hypotenuse distance (on the projected co-ordinates).
The mean difference in metres was 2.7 and the mean difference/metres was 0.0073% - which looks great.