Spark Streaming on Kafka print different cases for different values from kafka

Submitted by 强颜欢笑 on 2020-01-03 02:24:05

Question


I am stating my scenario below: 10,000 servers are sending df size data (every 5 seconds, 10,000 inputs arrive).

If for any server the df usage is more than 70%, print "Increase the ROM size by 20%". If for any server the df usage is less than 30%, print "Decrease the ROM size by 25%".

I am providing code below that takes messages from Kafka, keeps those containing "%", and upper-cases them. This code is only a reference for my Kafka setup.

Can anyone please help me implement this scenario?

package rnd

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils

object WordFind {
    def main(args: Array[String]) {
        val conf = new SparkConf().setMaster("local[*]").setAppName("KafkaReceiver")
        val batchIntervalSeconds = 2
        val ssc = new StreamingContext(conf, Seconds(10))

        // Receiver-based Kafka stream of (key, message) pairs, read via ZooKeeper.
        val kafkaStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(
            ssc, "localhost:2181", "spark-streaming-consumer-group", Map("wordcounttopic" -> 5))

        // Keep only messages containing "%" and upper-case them.
        val filteredStream: DStream[(String, String)] =
            kafkaStream.filter(_._2.contains("%")) // TODO: pattern matching here
        val outputDStream: DStream[String] = filteredStream.map(_._2.toUpperCase)

        outputDStream.print()

        ssc.start()
        ssc.awaitTerminationOrTimeout(batchIntervalSeconds * 5 * 1000)
    }
}
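
(For context: this uses the old receiver-based Kafka connector, as the kafka.serializer.StringDecoder import and KafkaUtils.createStream suggest. If it helps, a hypothetical build.sbt excerpt is below; the versions are my assumption for a Spark 2.x / Kafka 0.8 setup and must be matched to your cluster.)

    // Hypothetical build.sbt excerpt (assumed versions; adjust to your cluster).
    libraryDependencies ++= Seq(
      "org.apache.spark" %% "spark-streaming"           % "2.2.0" % "provided",
      "org.apache.spark" %% "spark-streaming-kafka-0-8" % "2.2.0" // receiver-based KafkaUtils.createStream
    )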

Please help me with code that satisfies this scenario.

Sample input

Filesystem     1K-blocks    Used  Available Use% Mounted on
/dev/sda1      132239776 6210884  119311504   5% /
tmpfs            4021876       0    4021876   0% /dev/shm

Sample output:

If Use% > 70 for any row, print the message: Increase ROM size by 20%
If Use% < 30 for any row, print the message: Decrease ROM size by 25%
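
To make the decision rule concrete, here is a minimal, Kafka-free sketch of the per-line logic (the helper name adviceFor is mine, not from the original code; it assumes Use% is the second-to-last whitespace-separated field):

    // Hypothetical helper: maps one df data row to the advice message.
    def adviceFor(line: String): String = {
      val fields = line.trim.split("\\s+")
      val usePercent = fields(fields.length - 2).stripSuffix("%").toInt // "5%" -> 5
      if (usePercent > 70) "Increase ROM size by 20%"
      else if (usePercent < 30) "Decrease ROM size by 25%"
      else "Undefined"
    }

    adviceFor("/dev/sda1 132239776 6210884 119311504 5% /") // "Decrease ROM size by 25%"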

I also have to write the output to Elasticsearch, and it is giving an error:

package rnd

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils

object WordFind {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local[*]").setAppName("KafkaReceiver")
    val checkpointDir = "/usr/local/kafka/kafka_2.11-0.11.0.2/checkpoint/"
    val batchIntervalSeconds = 2
    val ssc = new StreamingContext(conf, Seconds(batchIntervalSeconds))

    val kafkaStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(ssc, "localhost:2181",
      "spark-streaming-consumer-group", Map("wordcounttopic" -> 5))

    val filteredStream: DStream[Array[String]] = kafkaStream
      .filter(!_._2.contains("Filesystem")) // eliminate the header line
      .map(_._2.split("\\s+"))              // split on whitespace

    val outputDStream: DStream[String] = filteredStream.map { row =>
      val useIdx = row.length - 2
      // if Use% > 70: "Increase ROM size by 20%"
      // if Use% < 30: "Decrease ROM size by 25%"
      val usePercent = row(useIdx).replace("%", "").toInt
      usePercent match {
        case x if x > 70 => "Increase ROM size by 20%"
        case x if x < 30 => "Decrease ROM size by 25%"
        case _           => "Undefined"
      }
    }

    outputDStream.print()
    import org.elasticsearch.spark.sql._
    outputDStream.saveToEs("dfvalueoperations_v1/kwc")

    // Make sure data is not deleted by the time we query it interactively.
    ssc.remember(Minutes(1))
    ssc.checkpoint(checkpointDir)
    // Start the streaming context in the background.
    ssc.start()
    // Wait for up to 5 batch intervals before timing out.
    ssc.awaitTerminationOrTimeout(batchIntervalSeconds * 5 * 1000)
  }
}

error:

Error:(51, 21) value saveToEs is not a member of org.apache.spark.streaming.dstream.DStream[String]
outputDStream.saveToEs("kafkamessage_v1/kwc")
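
(For what it's worth, this error usually means the wrong implicit was imported: org.elasticsearch.spark.sql._ only adds saveToEs to DataFrames/Datasets, not to DStreams. A sketch of the usual alternatives, assuming the elasticsearch-hadoop connector, 5.x or later, is on the classpath:)

    // Option 1 (assumes elasticsearch-hadoop 5.x+): the streaming package adds
    // saveToEs to DStreams. Documents need structure, so wrap each message in a Map.
    import org.elasticsearch.spark.streaming._
    outputDStream.map(msg => Map("message" -> msg)).saveToEs("dfvalueoperations_v1/kwc")

    // Option 2: use the RDD-level implicit inside foreachRDD.
    import org.elasticsearch.spark._
    outputDStream.foreachRDD { rdd =>
      rdd.map(msg => Map("message" -> msg)).saveToEs("dfvalueoperations_v1/kwc")
    }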


Answer 1:


I have made a few assumptions to get the required output:

1.) Headers may appear in between records, hence a filter is used to remove the header line:

Filesystem 1K-blocks Used Available Use% Mounted on

2.) Since the Filesystem column may contain spaces, I extract Use% as the second field from the end of each row. (If this doesn't work, try a grouped regex to achieve the same; see the sketch after this list.)

3.) The case where the use percentage is between 30 and 70 is not defined, hence the output message is "Undefined" for such rows.
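
As a hedge for point 2, a grouped-regex variant might look like this (my own sketch, not part of the original answer; it assumes the percentage of interest is the last "N%" field before the final mount-point column):

    // Hypothetical regex alternative: capture the digits of the last "N%"
    // field that is followed by a single final (mount point) column.
    val UsePattern = """.*\s(\d+)%\s+\S+$""".r

    "/dev/sda1 132239776 6210884 119311504 5% /" match {
      case UsePattern(pct) => pct.toInt // 5
      case _               => -1        // unparsable row
    }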

A sample input/output run (using Array[String]):

 scala> val input =
      |   """|Filesystem    512-blocks      Used Available Capacity iused      ifree %iused  Mounted on
      |      |/dev/disk1     234618880 154868528  79238352    67% 1784543 4293182736    0%   /
      |      |devfs                364       364         0   100%     630          0  100%   /dev
      |      |map -hosts             0         0         0   100%       0          0  100%   /net
      |      |map auto_home          0         0         0   100%       0          0  100%   /home""".stripMargin


 scala> val inputStr: Array[Array[String]] = input.split("\n").filter(!_.contains("Filesystem")).map(_.split("\\s+"))

 scala> val outputMessage = inputStr.map {
      |       row =>
      |         // Assuming the position is always second from last
      |         val elementPosition = row.length - 2 
      |         // if Use%>70 for any case> Message: Increase ROM size by 20%
      |         // if Use%<30% for any case> Message: Decrease ROM size by 25%
      |         val usePercent = row(elementPosition).replace("%", "").toInt
      |         usePercent match {
      |           case x if x > 70 => (usePercent, "Increase ROM size by 20%")
      |           case x if x < 30 => (usePercent, "Decrease ROM size by 25%")
      |           case _ => (usePercent, "Undefined")
      |         }
      |     }

 scala> outputMessage.foreach(println)
 (0,Decrease ROM size by 25%)
 (100,Increase ROM size by 20%)
 (100,Increase ROM size by 20%)
 (100,Increase ROM size by 20%)

This code works for an Array[String]; please test it with a ReceiverInputDStream[(String, String)]. The code should be similar to:

 val filteredStream: DStream[Array[String]] = kafkaStream
   .filter(!_._2.contains("Filesystem")) // eliminate the header line
   .map(_._2.split("\\s+"))              // split on whitespace

 val outputDStream: DStream[String] = filteredStream.map { row =>
   val useIdx = row.length - 2
   // if Use% > 70: "Increase ROM size by 20%"
   // if Use% < 30: "Decrease ROM size by 25%"
   val usePercent = row(useIdx).replace("%", "").toInt
   usePercent match {
     case x if x > 70 => "Increase ROM size by 20%"
     case x if x < 30 => "Decrease ROM size by 25%"
     case _           => "Undefined"
   }
 }
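
One caveat worth noting: toInt will throw on any malformed row and fail the whole micro-batch. A defensive variant (my sketch using scala.util.Try, not from the original answer) drops unparsable rows instead:

    import scala.util.Try

    val safeOutput: DStream[String] = filteredStream.flatMap { row =>
      // Skip rows that are too short or whose Use% field isn't numeric.
      Try(row(row.length - 2).stripSuffix("%").toInt).toOption.map {
        case x if x > 70 => "Increase ROM size by 20%"
        case x if x < 30 => "Decrease ROM size by 25%"
        case _           => "Undefined"
      }
    }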

Hope this helps.



Source: https://stackoverflow.com/questions/48747604/spark-streaming-on-kafka-print-different-cases-for-different-values-from-kafka
