import spark.implicits._

val df1 = sc.parallelize(Seq(
  ("a1", 10, "ACTIVE", "ds1"),
  ("a1", 20, "ACTIVE", "ds1"),
  ("a2", 50, "ACTIVE", "ds1"),
  ("a3", 60, "ACTIVE", "ds1")
)).toDF("c1", "c2", "c3", "c4")
Here is a quick and dirty solution:
from pyspark.sql import functions as F
# keep the rows from df2 whose key c1 also appears in df1, renaming
# df2's tag column c5 to c4 so the schema lines up with df1
df3 = df1.join(df2, df1.c1 == df2.c1)\
    .select(df2.c1, df2.c2, df2.c3, df2.c5.alias('c4'))\
    .dropDuplicates()  # a1 matches twice, so the join duplicates each df2 row
df3.show()
+---+---+------+---+
| c1| c2| c3| c4|
+---+---+------+---+
| a1| 10|ACTIVE|ds2|
| a1| 20|ACTIVE|ds2|
| a1| 30|ACTIVE|ds2|
| a1| 40|ACTIVE|ds2|
+---+---+------+---+
# union df3 with df1 and rewrite c3 and c4: rows whose c4 is 'ds2' are
# flagged INACTIVE, and every row is re-tagged as ds1
# (note: dropDuplicates keeps an arbitrary row per (c1, c2) key)
df1.union(df3).dropDuplicates(['c1', 'c2'])\
    .select('c1', 'c2',
            F.when(F.col('c4') == 'ds2', 'INACTIVE').otherwise('ACTIVE').alias('c3'),
            F.lit('ds1').alias('c4'))\
    .orderBy('c1', 'c2')\
    .show()
+---+---+--------+---+
| c1| c2| c3| c4|
+---+---+--------+---+
| a1| 10| ACTIVE|ds1|
| a1| 20| ACTIVE|ds1|
| a1| 30|INACTIVE|ds1|
| a1| 40|INACTIVE|ds1|
| a2| 50| ACTIVE|ds1|
| a3| 60| ACTIVE|ds1|
+---+---+--------+---+
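One caveat: dropDuplicates(['c1','c2']) keeps an arbitrary row per key, so when a (c1, c2) pair exists in both frames there is no guarantee the ds1 row is the one that survives. A deterministic sketch of the same idea, using an anti-join instead of union-plus-dedup (my variant, not the solution above), would be:

from pyspark.sql import functions as F

# keep only the (c1, c2) keys that exist in df2 but not in df1,
# and re-tag them as INACTIVE rows belonging to ds1
extra = df2.join(df1, ['c1', 'c2'], 'left_anti')\
    .select('c1', 'c2',
            F.lit('INACTIVE').alias('c3'),
            F.lit('ds1').alias('c4'))

df1.union(extra).orderBy('c1', 'c2').show()

This produces the same table as above without depending on which duplicate happens to be kept.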