Question
In my code, table_df has some columns on which I am doing calculations like min, max, mean, etc., and I want to create new_df with the specified schema new_df_schema. In my logic, I have written Spark SQL for the calculations and I append each newly generated row to an initially empty new_df, so that at the end new_df holds the calculated values for all columns.
The problem is that when the number of columns is large, this leads to a performance issue. Can this be done without using the union() function, or with some other approach that improves performance?
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import sparkSession.sqlContext.implicits._
val table_df = Seq(
(10, 20, 30, 40, 50),
(100, 200, 300, 400, 500),
(111, 222, 333, 444, 555),
(1123, 2123, 3123, 4123, 5123),
(1321, 2321, 3321, 4321, 5321)
).toDF("col_1", "col_2", "col_3", "col_4", "col_5")
table_df.show(false)
table_df.createOrReplaceTempView("table_df")
val new_df_schema = StructType(
StructField("Column_Name", StringType, false) ::
StructField("number_of_values", LongType, false) ::
StructField("number_of_distinct_values", LongType, false) ::
StructField("distinct_count_with_nan", LongType, false) ::
StructField("distinct_count_without_nan", LongType, false) ::
StructField("is_unique", BooleanType, false) ::
StructField("number_of_missing_values", LongType, false) ::
StructField("percentage_of_missing_values", DoubleType, false) ::
StructField("percentage_of_unique_values", DoubleType, false) ::
StructField("05_PCT", DoubleType, false) ::
StructField("25_PCT", DoubleType, false) ::
StructField("50_PCT", DoubleType, false) ::
StructField("75_PCT", DoubleType, false) ::
StructField("95_PCT", DoubleType, false) ::
StructField("max", DoubleType, false) ::
StructField("min", DoubleType, false) ::
StructField("mean", DoubleType, false) ::
StructField("std", DoubleType, false) ::
StructField("skewness", DoubleType, false) ::
StructField("kurtosis", DoubleType, false) ::
StructField("range", DoubleType, false) ::
StructField("variance", DoubleType, false) :: Nil
)
var new_df = sparkSession.createDataFrame(sparkSession.sparkContext.emptyRDD[Row], new_df_schema)
for (c <- table_df.columns) {
val num = sparkSession.sql(
s"""SELECT
| '$c' AS Column_Name,
| COUNT(${c}) AS number_of_values,
| COUNT(DISTINCT ${c}) AS number_of_distinct_values,
| COUNT(DISTINCT ${c}) AS distinct_count_with_nan,
| (COUNT(DISTINCT ${c}) - 1) AS distinct_count_without_nan,
| (COUNT(${c}) == COUNT(DISTINCT ${c})) AS is_unique,
| (COUNT(*) - COUNT(${c})) AS number_of_missing_values,
| ((COUNT(*) - COUNT(${c}))/COUNT(*)) AS percentage_of_missing_values,
| (COUNT(DISTINCT ${c})/COUNT(*)) AS percentage_of_unique_values,
| APPROX_PERCENTILE($c,0.05) AS 05_PCT,
| APPROX_PERCENTILE($c,0.25) AS 25_PCT,
| APPROX_PERCENTILE($c,0.50) AS 50_PCT,
| APPROX_PERCENTILE($c,0.75) AS 75_PCT,
| APPROX_PERCENTILE($c,0.95) AS 95_PCT,
| MAX($c) AS max,
| MIN($c) AS min,
| MEAN($c) AS mean,
| STD($c) AS std,
| SKEWNESS($c) AS skewness,
| KURTOSIS($c) AS kurtosis,
| (MAX($c) - MIN($c)) AS range,
| VARIANCE($c) AS variance
| FROM
| table_df""".stripMargin)
.toDF()
new_df = new_df.union(num) // this causes a performance issue when table_df has many columns
}
new_df.show(false)
==================================================
table_df:
+-----+-----+-----+-----+-----+
|col_1|col_2|col_3|col_4|col_5|
+-----+-----+-----+-----+-----+
|10 |20 |30 |40 |50 |
|100 |200 |300 |400 |500 |
|111 |222 |333 |444 |555 |
|1123 |2123 |3123 |4123 |5123 |
|1321 |2321 |3321 |4321 |5321 |
+-----+-----+-----+-----+-----+
new_df:
+-----------+----------------+-------------------------+-----------------------+--------------------------+---------+------------------------+----------------------------+---------------------------+------+------+------+------+------+------+----+------+------------------+-------------------+-------------------+------+-----------------+
|Column_Name|number_of_values|number_of_distinct_values|distinct_count_with_nan|distinct_count_without_nan|is_unique|number_of_missing_values|percentage_of_missing_values|percentage_of_unique_values|05_PCT|25_PCT|50_PCT|75_PCT|95_PCT|max |min |mean |std |skewness |kurtosis |range |variance |
+-----------+----------------+-------------------------+-----------------------+--------------------------+---------+------------------------+----------------------------+---------------------------+------+------+------+------+------+------+----+------+------------------+-------------------+-------------------+------+-----------------+
|col_1 |5 |5 |5 |4 |true |0 |0.0 |1.0 |10.0 |100.0 |111.0 |1123.0|1321.0|1321.0|10.0|533.0 |634.0634826261484 |0.4334269738367067 |-1.7463346405299973|1311.0|402036.5 |
|col_2 |5 |5 |5 |4 |true |0 |0.0 |1.0 |20.0 |200.0 |222.0 |2123.0|2321.0|2321.0|20.0|977.2 |1141.1895986206673|0.4050513738738682 |-1.799741951675132 |2301.0|1302313.7 |
|col_3 |5 |5 |5 |4 |true |0 |0.0 |1.0 |30.0 |300.0 |333.0 |3123.0|3321.0|3321.0|30.0|1421.4|1649.399072389699 |0.3979251063785061 |-1.8119558312496054|3291.0|2720517.3 |
|col_4 |5 |5 |5 |4 |true |0 |0.0 |1.0 |40.0 |400.0 |444.0 |4123.0|4321.0|4321.0|40.0|1865.6|2157.926620624529 |0.39502047381456235|-1.8165124206347685|4281.0|4656647.3 |
|col_5 |5 |5 |5 |4 |true |0 |0.0 |1.0 |50.0 |500.0 |555.0 |5123.0|5321.0|5321.0|50.0|2309.8|2666.59027598917 |0.3935246673563026 |-1.8186685628112493|5271.0|7110703.699999999|
+-----------+----------------+-------------------------+-----------------------+--------------------------+---------+------------------------+----------------------------+---------------------------+------+------+------+------+------+------+----+------+------------------+-------------------+-------------------+------+-----------------+
Answer 1:
An alternative to union. Check the code below.
scala> df.show(false)
+-----+-----+-----+-----+-----+
|col_1|col_2|col_3|col_4|col_5|
+-----+-----+-----+-----+-----+
|10 |20 |30 |40 |50 |
|100 |200 |300 |400 |500 |
|111 |222 |333 |444 |555 |
|1123 |2123 |3123 |4123 |5123 |
|1321 |2321 |3321 |4321 |5321 |
+-----+-----+-----+-----+-----+
Build Required Expressions.
scala> val descExpr = array(
df.columns
.map(c => struct(
lit(c).cast("string").as("column_name"),
max(col(c)).cast("string").as("max"),
min(col(c)).cast("string").as("min"),
mean(col(c)).cast("string").as("mean"),
stddev(col(c)).cast("string").as("std"),
skewness(col(c)).cast("string").as("skewness"),
kurtosis(col(c)).cast("string").as("kurtosis")
)
):_*
).as("data")
Required Columns.
val columns = Seq("column_name","max","min","mean","std","skewness","kurtosis")
.map(c => if(c != "column_name") col(c).cast("double").as(c) else col(c))
Final Output
scala> df
.select(descExpr)
.selectExpr("explode(data) as data")
.select("data.*")
.select(columns:_*)
.show(false)
+-----------+------+----+------+------------------+-------------------+-------------------+
|column_name|max |min |mean |std |skewness |kurtosis |
+-----------+------+----+------+------------------+-------------------+-------------------+
|col_1 |1321.0|10.0|533.0 |634.0634826261484 |0.43342697383670664|-1.7463346405299978|
|col_2 |2321.0|20.0|977.2 |1141.1895986206673|0.4050513738738679 |-1.7997419516751327|
|col_3 |3321.0|30.0|1421.4|1649.3990723896993|0.397925106378506 |-1.8119558312496056|
|col_4 |4321.0|40.0|1865.6|2157.9266206245293|0.3950204738145622 |-1.8165124206347691|
|col_5 |5321.0|50.0|2309.8|2666.5902759891706|0.3935246673563026 |-1.81866856281125 |
+-----------+------+----+------+------------------+-------------------+-------------------+
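This avoids union entirely: all aggregates are computed in a single pass over the data, the array of structs collects one struct per source column, and explode pivots that single aggregated row into one output row per column. The logical plan therefore stays small no matter how many columns table_df has, instead of growing with every union in a loop.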
Updated
scala> val finalDF = df.select(descExpr).selectExpr("explode(data) as data").select("data.*").select(columns:_*)
Create a new dataframe with approxQuantile for all columns.
scala> val approxQuantileDF = df
.columns
.map(c => (c,df.stat.approxQuantile(c,Array(0.25,0.5,0.75),0.0)))
.toList
.toDF("column_name","approx_quantile")
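As a side note, approxQuantile also has a multi-column overload (Spark 2.2+), so the per-column calls above can be collapsed into a single call. A minimal sketch, assuming the same df and probabilities:

// One call computes the quantiles for every column at once;
// the result is an Array[Array[Double]] in column order.
val quantiles = df.stat.approxQuantile(df.columns, Array(0.25, 0.5, 0.75), 0.0)

val approxQuantileDF = df.columns
  .zip(quantiles)
  .toList
  .toDF("column_name", "approx_quantile")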
scala> finalDF
.join(approxQuantileDF,
Seq("column_name"),
"left"
).show(false)
+-----------+------+----+------+------------------+-------------------+-------------------+----------------------+
|column_name|max |min |mean |std |skewness |kurtosis |approx_quantile |
+-----------+------+----+------+------------------+-------------------+-------------------+----------------------+
|col_1 |1321.0|10.0|533.0 |634.0634826261484 |0.43342697383670664|-1.7463346405299978|[100.0, 111.0, 1123.0]|
|col_2 |2321.0|20.0|977.2 |1141.1895986206673|0.4050513738738679 |-1.7997419516751327|[200.0, 222.0, 2123.0]|
|col_3 |3321.0|30.0|1421.4|1649.3990723896993|0.397925106378506 |-1.8119558312496056|[300.0, 333.0, 3123.0]|
|col_4 |4321.0|40.0|1865.6|2157.9266206245293|0.3950204738145622 |-1.8165124206347691|[400.0, 444.0, 4123.0]|
|col_5 |5321.0|50.0|2309.8|2666.5902759891706|0.3935246673563026 |-1.81866856281125 |[500.0, 555.0, 5123.0]|
+-----------+------+----+------+------------------+-------------------+-------------------+----------------------+
Answer 2:
Another alternative: there is a summary() API on Dataset which computes basic stats in the below format.
ds.summary("count", "min", "25%", "75%", "max").show()
// output:
// summary age height
// count 10.0 10.0
// min 18.0 163.0
// 25% 24.0 176.0
// 75% 32.0 180.0
// max 92.0 192.0
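For example, applied to the question's table_df (a sketch; note that summary() returns every statistic as a string):

table_df.summary("count", "min", "max").show()
// summary  col_1  col_2  ...
// count    5      5
// min      10     20
// max      1321   2321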
Similarly, you can enrich the DataFrame API to get the stats in the format you require, as below.
Define RichDataFrame and the implicits to use:
import org.apache.spark.sql.functions.expr
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{NumericType, StringType, StructField, StructType}
import scala.language.implicitConversions
class RichDataFrame(ds: DataFrame) {
def statSummary(statistics: String*): DataFrame = {
val defaultStatistics = Seq("max", "min", "mean", "std", "skewness", "kurtosis")
val statFunctions = if (statistics.nonEmpty) statistics else defaultStatistics
val selectedCols = ds.schema
.filter(a => a.dataType.isInstanceOf[NumericType] || a.dataType.isInstanceOf[StringType])
.map(_.name)
val percentiles = statFunctions.filter(a => a.endsWith("%")).map { p =>
try {
p.stripSuffix("%").toDouble / 100.0
} catch {
case e: NumberFormatException =>
throw new IllegalArgumentException(s"Unable to parse $p as a percentile", e)
}
}
require(percentiles.forall(p => p >= 0 && p <= 1), "Percentiles must be in the range [0, 1]")
val aggExprs = selectedCols.flatMap(c => {
var percentileIndex = 0
statFunctions.map { stats =>
if (stats.endsWith("%")) {
val index = percentileIndex
percentileIndex += 1
expr(s"cast(percentile_approx($c, array(${percentiles.mkString(", ")}))[$index] as string)")
} else {
expr(s"cast($stats($c) as string)")
}
}
})
val aggResult = ds.select(aggExprs: _*).head()
val r = aggResult.toSeq.grouped(statFunctions.length).toArray
.zip(selectedCols)
.map{case(seq, column) => column +: seq }
.map(Row.fromSeq)
val output = StructField("columns", StringType) +: statFunctions.map(c => StructField(c, StringType))
val spark = ds.sparkSession
spark.createDataFrame(spark.sparkContext.parallelize(r), StructType(output))
}
}
object RichDataFrame {
trait Enrichment {
implicit def enrichMetadata(ds: DataFrame): RichDataFrame =
new RichDataFrame(ds)
}
object implicits extends Enrichment
}
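The reshaping follows the same single-pass idea as above: every aggregate expression for every column is evaluated in one ds.select(...).head(), and the resulting wide row is then split into groups of statFunctions.length values, one group per source column.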
Test it with the provided test data:
val table_df = Seq(
(10, 20, 30, 40, 50),
(100, 200, 300, 400, 500),
(111, 222, 333, 444, 555),
(1123, 2123, 3123, 4123, 5123),
(1321, 2321, 3321, 4321, 5321)
).toDF("col_1", "col_2", "col_3", "col_4", "col_5")
table_df.show(false)
table_df.printSchema()
/**
* +-----+-----+-----+-----+-----+
* |col_1|col_2|col_3|col_4|col_5|
* +-----+-----+-----+-----+-----+
* |10 |20 |30 |40 |50 |
* |100 |200 |300 |400 |500 |
* |111 |222 |333 |444 |555 |
* |1123 |2123 |3123 |4123 |5123 |
* |1321 |2321 |3321 |4321 |5321 |
* +-----+-----+-----+-----+-----+
*
* root
* |-- col_1: integer (nullable = false)
* |-- col_2: integer (nullable = false)
* |-- col_3: integer (nullable = false)
* |-- col_4: integer (nullable = false)
* |-- col_5: integer (nullable = false)
*/
import RichDataFrame.implicits._
table_df.statSummary()
.show(false)
/**
* +-------+----+---+------+------------------+------------------+-------------------+
* |columns|max |min|mean |std |skewness |kurtosis |
* +-------+----+---+------+------------------+------------------+-------------------+
* |col_1 |1321|10 |533.0 |634.0634826261484 |0.4334269738367066|-1.7463346405299973|
* |col_2 |2321|20 |977.2 |1141.1895986206675|0.405051373873868 |-1.7997419516751323|
* |col_3 |3321|30 |1421.4|1649.399072389699 |0.3979251063785061|-1.8119558312496056|
* |col_4 |4321|40 |1865.6|2157.926620624529 |0.3950204738145622|-1.816512420634769 |
* |col_5 |5321|50 |2309.8|2666.5902759891706|0.3935246673563024|-1.81866856281125 |
* +-------+----+---+------+------------------+------------------+-------------------+
*/
You can also specify the functions you want, as below:
import RichDataFrame.implicits._
table_df.statSummary("sum", "count", "25%", "75%")
.show(false)
/**
* +-------+-----+-----+---+----+
* |columns|sum |count|25%|75% |
* +-------+-----+-----+---+----+
* |col_1 |2665 |5 |100|1123|
* |col_2 |4886 |5 |200|2123|
* |col_3 |7107 |5 |300|3123|
* |col_4 |9328 |5 |400|4123|
* |col_5 |11549|5 |500|5123|
* +-------+-----+-----+---+----+
*/
Update: As per the ask, the new DF can be computed as below.
val count_star = table_df.count()
table_df.statSummary("count", "approx_count_distinct", "5%", "25%", "50%", "75%", "95%",
"max", "min", "mean", "std", "SKEWNESS", "KURTOSIS", "VARIANCE")
.withColumn("count_star", lit(count_star))
.selectExpr(
"columns AS Column_Name",
"COUNT AS number_of_values",
"approx_count_distinct AS number_of_distinct_values",
"approx_count_distinct AS distinct_count_with_nan",
"(approx_count_distinct - 1) AS distinct_count_without_nan",
"(count == approx_count_distinct) AS is_unique",
"(count_star - count) AS number_of_missing_values",
"((count_star - count)/count) AS percentage_of_missing_values",
"(approx_count_distinct/count) AS percentage_of_unique_values",
"`5%` AS 05_PCT",
"`25%` AS 25_PCT",
"`50%` AS 50_PCT",
"`75%` AS 75_PCT",
"`95%` AS 95_PCT",
"MAX AS max",
"MIN AS min",
"MEAN AS mean",
"STD AS std",
"SKEWNESS AS skewness",
"KURTOSIS AS kurtosis",
"(MAX - MIN) AS range",
"VARIANCE AS variance"
).show(false)
/**
* +-----------+----------------+-------------------------+-----------------------+--------------------------+---------+------------------------+----------------------------+---------------------------+------+------+------+------+------+----+---+------+------------------+------------------+-------------------+------+------------------+
* |Column_Name|number_of_values|number_of_distinct_values|distinct_count_with_nan|distinct_count_without_nan|is_unique|number_of_missing_values|percentage_of_missing_values|percentage_of_unique_values|05_PCT|25_PCT|50_PCT|75_PCT|95_PCT|max |min|mean |std |skewness |kurtosis |range |variance |
* +-----------+----------------+-------------------------+-----------------------+--------------------------+---------+------------------------+----------------------------+---------------------------+------+------+------+------+------+----+---+------+------------------+------------------+-------------------+------+------------------+
* |col_1 |5 |5 |5 |4.0 |true |0.0 |0.0 |1.0 |10 |100 |111 |1123 |1321 |1321|10 |533.0 |634.0634826261484 |0.4334269738367066|-1.7463346405299973|1311.0|402036.5 |
* |col_2 |5 |5 |5 |4.0 |true |0.0 |0.0 |1.0 |20 |200 |222 |2123 |2321 |2321|20 |977.2 |1141.1895986206675|0.405051373873868 |-1.7997419516751323|2301.0|1302313.7000000002|
* |col_3 |5 |5 |5 |4.0 |true |0.0 |0.0 |1.0 |30 |300 |333 |3123 |3321 |3321|30 |1421.4|1649.399072389699 |0.3979251063785061|-1.8119558312496056|3291.0|2720517.3 |
* |col_4 |5 |5 |5 |4.0 |true |0.0 |0.0 |1.0 |40 |400 |444 |4123 |4321 |4321|40 |1865.6|2157.926620624529 |0.3950204738145622|-1.816512420634769 |4281.0|4656647.3 |
* |col_5 |5 |5 |5 |4.0 |true |0.0 |0.0 |1.0 |50 |500 |555 |5123 |5321 |5321|50 |2309.8|2666.5902759891706|0.3935246673563024|-1.81866856281125 |5271.0|7110703.7 |
* +-----------+----------------+-------------------------+-----------------------+--------------------------+---------+------------------------+----------------------------+---------------------------+------+------+------+------+------+----+---+------+------------------+------------------+-------------------+------+------------------+
*/
Answer 3:
def main(args: Array[String]): Unit = {
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import sparkSession.sqlContext.implicits._
val df = Seq(
(10, 20, 30, 40, 50),
(100, 200, 300, 400, 500),
(10, 222, 333, 444, 555),
(1123, 2123, 3123, 4123, 5123),
(1321, 2321, 3321, 4321, 5321)
).toDF("col_1", "col_2", "col_3", "col_4", "col_5")
df.show(false)
val descExpr = array(
df.columns.map(c => struct(
lit(c).cast("string").as("Column_Name"),
count(col(c)).cast("string").as("number_of_values"),
countDistinct(col(c)).cast("string").as("number_of_distinct_values"),
countDistinct(col(c)).cast("string").as("distinct_count_with_nan"),
(countDistinct(col(c)) - 1).cast("string").as("distinct_count_without_nan"),
(count(col(c)) === countDistinct(col(c))).cast("string").as("is_unique"),
(count("*") - count(col(c))).cast("string").as("number_of_missing_values"),
((count("*") - count(col(c))) / count("*")).cast("string").as("percentage_of_missing_values"),
(countDistinct(col(c)) / count("*")).cast("string").as("percentage_of_unique_values"),
max(col(c)).cast("string").as("max"),
min(col(c)).cast("string").as("min"),
mean(col(c)).cast("string").as("mean"),
(max(col(c)) - min(col(c))).cast("string").as("range"),
stddev(col(c)).cast("string").as("std"),
skewness(col(c)).cast("string").as("skewness"),
kurtosis(col(c)).cast("string").as("kurtosis"),
variance(col(c)).cast("string").as("variance")
)
): _*
).as("data")
val columns = Seq("Column_Name", "number_of_values", "number_of_distinct_values", "distinct_count_with_nan", "distinct_count_without_nan", "is_unique", "number_of_missing_values", "percentage_of_missing_values", "percentage_of_unique_values", "max", "min", "mean", "range", "std", "skewness", "kurtosis", "variance")
.map(c => if (c != "Column_Name" && c != "is_unique") col(c).cast("double").as(c) else col(c))
var df1 = df
.select(descExpr)
.selectExpr("explode(data) as data")
.select("data.*")
.select(columns: _*)
df1 = df1
.withColumn("is_unique", col("is_unique").cast(BooleanType))
val approxQuantileDF = df
.columns
.map(c => (c, df.stat.approxQuantile(c, Array(0.05, 0.25, 0.5, 0.75, 0.95), 0.0))) // five probabilities, matching the five *_PCT aliases below
.toList
.toDF("Column_Name", "approx_quantile")
.select(
expr("Column_Name"),
expr("approx_quantile[0] as 05_PCT"),
expr("approx_quantile[1] as 25_PCT"),
expr("approx_quantile[2] as 50_PCT"),
expr("approx_quantile[3] as 75_PCT"),
expr("approx_quantile[4] as 95_PCT")
)
df1 = df1.join(approxQuantileDF, Seq("Column_Name"), "left")
df1.show(false)
}
I hope this helps. This code is just an extension of the answer provided by @Srinivas.
Source: https://stackoverflow.com/questions/62907356/in-spark-is-their-any-alternative-for-union-function-while-appending-new-row