I have a correlation matrix calculated as follows on PySpark 2.2:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.m
You are almost there! There is no need to use the old RDD-based MLlib API.
This is my method for generating a pandas DataFrame, which you can then export to Excel, CSV, or other formats.
def correlation_matrix(df, corr_columns, method='pearson'):
    """Return the correlation matrix of selected columns as a pandas DataFrame.

    Args:
        df: Spark DataFrame containing the columns to correlate.
        corr_columns: List of column names to include; they are assembled
            into a single vector column and also used as the row and column
            labels of the result.
        method: Correlation method passed through to ``Correlation.corr``
            (for example ``'pearson'`` or ``'spearman'``).

    Returns:
        A pandas DataFrame whose index and columns are both ``corr_columns``.
    """
    vector_col = "corr_features"
    assembler = VectorAssembler(inputCols=corr_columns, outputCol=vector_col)
    df_vector = assembler.transform(df).select(vector_col)
    matrix = Correlation.corr(df_vector, vector_col, method)
    # The result has one row and one column, and that column is named after
    # the method used (e.g. "spearman(corr_features)"). Reading it by
    # position avoids the hard-coded "pearson(...)" key, which broke every
    # non-default method.
    corr_matrix = matrix.collect()[0][0]
    # toArray() yields the 2-D array directly, so no manual reshape of the
    # flat, column-major ``values`` buffer is needed.
    return pd.DataFrame(
        corr_matrix.toArray(), columns=corr_columns, index=corr_columns
    )