Spark Reading and Writing HBase Data


Environment

spark: 2.4.3

hbase: 2.2.0

Steps

  1. Start hadoop-3.1.2 and hbase-2.2.0.

Copy a number of jar files from HBase's lib directory into Spark. These are the jars the code below needs on its classpath; they include all jar files whose names start with hbase, plus guava-12.0.1.jar and protobuf-java-2.5.0.jar.
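A minimal sketch of this copy step, assuming HBase is installed under /opt/hbase-2.2.0 and Spark under /home/hadoop/spark-2.4.3 (the paths used by the spark-submit commands later in this post); the exact guava/protobuf jar names depend on your HBase release:

  # copy the HBase client jars plus guava and protobuf directly into Spark's jars directory
  cd /opt/hbase-2.2.0/lib
  cp hbase*.jar guava-12.0.1.jar protobuf-java-2.5.0.jar /home/hadoop/spark-2.4.3/jars/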

  2. Create the table in HBase and insert some data, for example:
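A minimal hbase shell sketch, assuming the student table with a single info column family and the info:name / info:gender / info:age columns used by the code below; the two sample rows are purely illustrative:

  # create the 'student' table with one column family 'info'
  create 'student', 'info'
  # insert two sample rows with row keys '1' and '2'
  put 'student', '1', 'info:name', 'Xueqian'
  put 'student', '1', 'info:gender', 'F'
  put 'student', '1', 'info:age', '23'
  put 'student', '2', 'info:name', 'Weiliang'
  put 'student', '2', 'info:gender', 'M'
  put 'student', '2', 'info:age', '24'
  # check the contents
  scan 'student'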

  3. Develop the Spark-on-HBase code in IDEA.

build.sbt
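A minimal build.sbt sketch, assuming Scala 2.11 (matching the hbaseoperation_2.11-0.1.jar name used below), Spark 2.4.3 and HBase 2.2.0; adjust the versions and artifact list to your environment:

  name := "hbaseoperation"
  version := "0.1"
  scalaVersion := "2.11.12"

  // Spark and the HBase client jars are already on the cluster classpath (step 1), so mark them provided
  libraryDependencies ++= Seq(
    "org.apache.spark" %% "spark-core" % "2.4.3" % "provided",
    "org.apache.hbase" % "hbase-client" % "2.2.0" % "provided",
    "org.apache.hbase" % "hbase-common" % "2.2.0" % "provided",
    "org.apache.hbase" % "hbase-server" % "2.2.0" % "provided",
    "org.apache.hbase" % "hbase-mapreduce" % "2.2.0" % "provided"
  )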

  • Spark reading data from HBase
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkOperateHBase {
  def main(args: Array[String]) {
    val conf = HBaseConfiguration.create()
    val sc = new SparkContext(new SparkConf())
    // set the name of the table to read
    conf.set(TableInputFormat.INPUT_TABLE, "student")
    val stuRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
    stuRDD.cache() // cache before triggering the two actions below
    val count = stuRDD.count()
    println("Students RDD Count:" + count)
    // iterate over the result rows and print each cell of the 'info' column family
    stuRDD.foreach({ case (_, result) =>
      val key = Bytes.toString(result.getRow)
      val name = Bytes.toString(result.getValue("info".getBytes, "name".getBytes))
      val gender = Bytes.toString(result.getValue("info".getBytes, "gender".getBytes))
      val age = Bytes.toString(result.getValue("info".getBytes, "age".getBytes))
      println("Row key:" + key + " Name:" + name + " Gender:" + gender + " Age:" + age)
    })
  }
}
  • Spark writing data to HBase
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark._

object SparkWriteHBase {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkWriteHBase").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val tablename = "student"
    sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)

    val job = Job.getInstance(sc.hadoopConfiguration)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    // build two records in the form rowkey,name,gender,age
    val indataRDD = sc.makeRDD(Array("3,Rongcheng,M,26", "4,Guanhua,M,27"))
    val rdd = indataRDD.map(_.split(',')).map { arr =>
      val put = new Put(Bytes.toBytes(arr(0)))                                              // row key
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))    // info:name
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("gender"), Bytes.toBytes(arr(2)))  // info:gender
      // store age as a string so the read example's Bytes.toString can decode it
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes(arr(3)))     // info:age
      (new ImmutableBytesWritable, put)
    }
    rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}
  4. Package the jar with sbt (see the sketch below), then run it with spark-submit in the test environment:
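A sketch of the packaging step, assuming the sbt project layout above; the jar name matches the one used in the spark-submit commands below:

  # run from the project root; the packaged jar is written to target/scala-2.11/
  sbt clean package
  # then copy target/scala-2.11/hbaseoperation_2.11-0.1.jar to /home/hadoop/ on the test machine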
  • Reading from HBase with Spark
./spark-submit --driver-class-path /home/hadoop/spark-2.4.3/jars/*:/opt/hbase-2.2.0/conf --class "SparkOperateHBase"  /home/hadoop/hbaseoperation_2.11-0.1.jar

  • Writing to HBase with Spark
./spark-submit --driver-class-path /home/hadoop/spark-2.4.3/jars/*:/opt/hbase-2.2.0/conf --class "SparkWriteHBase"  /home/hadoop/hbaseoperation_2.11-0.1.jar

Results:

Pitfalls

In step 1, I had originally copied the HBase-related jars into an hbase subfolder under Spark's jars directory; running the Spark job then failed with:

Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/HBaseConfiguration

Solution: copy the HBase dependencies from step 1 directly into the spark/jars directory rather than into a subfolder; Spark only picks up jars that sit directly in jars/.

References

http://dblab.xmu.edu.cn/blog/1316-2/
