Environment
spark: 2.4.3
hbase: 2.2.0
hadoop: 3.1.2
Steps
- Start hadoop-3.1.2 and hbase-2.2.0
Copy a number of jar files from HBase's lib directory into Spark; these are the jars the program needs on its classpath. The files to copy are: all jar files whose names start with hbase, plus guava-12.0.1.jar and protobuf-java-2.5.0.jar.
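A sketch of this step as shell commands (the installation paths are taken from the spark-submit commands below; adjust to your environment):

start-dfs.sh      # start Hadoop (HDFS)
start-hbase.sh    # start HBase
# Copy the HBase jars directly into Spark's jars directory
# (not into a subdirectory -- see the Pitfalls section below)
cp /opt/hbase-2.2.0/lib/hbase*.jar /home/hadoop/spark-2.4.3/jars/
cp /opt/hbase-2.2.0/lib/guava-12.0.1.jar /home/hadoop/spark-2.4.3/jars/
cp /opt/hbase-2.2.0/lib/protobuf-java-2.5.0.jar /home/hadoop/spark-2.4.3/jars/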
- Create the table in HBase and insert sample data, as sketched below
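For example, in the HBase shell (the two sample rows are illustrative assumptions, chosen so the read example below has data to print):

create 'student', 'info'
put 'student', '1', 'info:name', 'Xueqian'
put 'student', '1', 'info:gender', 'F'
put 'student', '1', 'info:age', '23'
put 'student', '2', 'info:name', 'Weiliang'
put 'student', '2', 'info:gender', 'M'
put 'student', '2', 'info:age', '24'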
- Develop the Spark code that operates on HBase in IDEA
build.sbt
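The original build.sbt content is not shown; a minimal sketch consistent with the jar name used below (hbaseoperation_2.11-0.1.jar) and the versions above might look like this:

name := "hbaseoperation"
version := "0.1"
scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.4.3" % "provided",  // supplied by the cluster at runtime
  "org.apache.hbase" % "hbase-client" % "2.2.0",
  "org.apache.hbase" % "hbase-mapreduce" % "2.2.0"  // TableInputFormat/TableOutputFormat live here in HBase 2.x
)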
- Spark reads data from HBase
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object SparkOperateHBase {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    val sc = new SparkContext(new SparkConf().setAppName("SparkOperateHBase"))
    // Set the table to read
    conf.set(TableInputFormat.INPUT_TABLE, "student")
    val stuRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable], classOf[Result])
    val count = stuRDD.count()
    println("Students RDD Count:" + count)
    stuRDD.cache()
    // Iterate over the rows and print each cell value
    stuRDD.foreach { case (_, result) =>
      val key = Bytes.toString(result.getRow)
      val name = Bytes.toString(result.getValue("info".getBytes, "name".getBytes))
      val gender = Bytes.toString(result.getValue("info".getBytes, "gender".getBytes))
      val age = Bytes.toString(result.getValue("info".getBytes, "age".getBytes))
      println("Row key:" + key + " Name:" + name + " Gender:" + gender + " Age:" + age)
    }
  }
}
- Spark writes data to HBase
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.{SparkConf, SparkContext}

object SparkWriteHBase {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkWriteHBase").setMaster("local")
    val sc = new SparkContext(sparkConf)
    val tablename = "student"
    sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)

    val job = Job.getInstance(sc.hadoopConfiguration)  // new Job(...) is deprecated
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])  // we emit Put mutations, not Result
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    // Build two rows of records
    val indataRDD = sc.makeRDD(Array("3,Rongcheng,M,26", "4,Guanhua,M,27"))
    val rdd = indataRDD.map(_.split(',')).map { arr =>
      val put = new Put(Bytes.toBytes(arr(0)))  // row key
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))    // info:name
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("gender"), Bytes.toBytes(arr(2)))  // info:gender
      // store age as a string so the read example's Bytes.toString can decode it
      put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes(arr(3)))     // info:age
      (new ImmutableBytesWritable, put)
    }
    rdd.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}
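After the write job finishes, the two new rows can be verified from the HBase shell:

scan 'student'
# row keys 3 and 4 should now carry info:name, info:gender and info:age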
- Package the jar with sbt and deploy it to the test environment to run with Spark
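Packaging is done from the project root with sbt (the project directory name is an assumption):

cd ~/hbaseoperation
sbt package
# produces target/scala-2.11/hbaseoperation_2.11-0.1.jar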
- Spark reads HBase:
./spark-submit --driver-class-path /home/hadoop/spark-2.4.3/jars/*:/opt/hbase-2.2.0/conf --class "SparkOperateHBase" /home/hadoop/hbaseoperation_2.11-0.1.jar
- Spark writes to HBase:
./spark-submit --driver-class-path /home/hadoop/spark-2.4.3/jars/*:/opt/hbase-2.2.0/conf --class "SparkWriteHBase" /home/hadoop/hbaseoperation_2.11-0.1.jar
Results: (output screenshot omitted)
Pitfalls
In step 1, I first copied the HBase jars into an hbase subfolder under Spark's jars directory; running Spark then failed with:
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/hadoop/hbase/HBaseConfiguration
Fix: copy the HBase dependencies from step 1 directly into the spark/jars directory. By default Spark only puts the jars sitting immediately under spark/jars on the classpath, so jars in a subdirectory are never loaded.