HBase与MapReduce的集成

折月煮酒 提交于 2019-12-21 10:04:59

HBase当中的数据最终都是存储在HDFS上面的,HBase天生的支持MR的操作,我们可以通过MR直接处理HBase当中的数据,并且MR可以将处理后的结果直接存储到HBase当中去

一、读取myuser这张表当中的数据写入到HBase的另外一张表当中去

读取HBase当中一张表的数据,然后将数据写入到HBase当中的另外一张表当中去。注意:我们可以使用TableMapper与TableReducer来实现从HBase当中读取与写入数据

将myuser这张表当中f1列族的name和age字段写入到myuser2这张表的f1列族当中去

1、创建myuser2这张表

hbase(main):010:0> create 'myuser2','f1'

2、创建maven工程,导入jar包

<repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <dependencies>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0-mr1-cdh5.14.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.0-cdh5.14.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.0-cdh5.14.0</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.testng</groupId>
            <artifactId>testng</artifactId>
            <version>6.14.3</version>
            <scope>test</scope>
        </dependency>


    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                    <!--    <verbal>true</verbal>-->
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.2</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*/RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

3、开发MR的程序 

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class HBaseMR extends Configured implements Tool {

    public static class HBaseMapper extends TableMapper<Text,Put>{
        /**
         * @param key   主键rowkey
         * @param value 一行数据所有列的值都封装在value里面了
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {

            String rowKey = Bytes.toString(key.get());
            Put put = new Put(key.get());
            Cell[] cells = value.rawCells();
            for (Cell cell : cells) {
                if ("f1".equals(Bytes.toString(CellUtil.cloneFamily(cell)))){
                    if ("name".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))){
                     put.add(cell);
                    }
                    if ("age".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))){
                     put.add(cell);
                    }
                }
            }
            if (!put.isEmpty()){
                context.write(new Text(rowKey),put);
            }
        }
    }

    public static class HBaseReducer extends TableReducer<Text,Put,ImmutableBytesWritable> {

        @Override
        protected void reduce(Text key, Iterable<Put> values, Context context) throws IOException, InterruptedException {
            for (Put value : values) {
                context.write(null,value);
            }
        }
    }

    @Override
    public int run(String[] args) throws Exception {

        Job job = Job.getInstance(super.getConf());
        job.setJarByClass(this.getClass());

        Scan scan = new Scan();
        //使用TableMapReduceUtil 工具类来初始化我们的mapper    TableMapReduceUtil.initTableMapperJob(TableName.valueOf("myuser2"),scan,HBaseMapper.class,Text.class,Put.class,job);
        //使用TableMapReduceUtil 工具类来初始化我们的reducer
        TableMapReduceUtil.initTableReducerJob("myuser4",HBaseReducer.class,job);

        job.setNumReduceTasks(1);
        return job.waitForCompletion(true)?0:1;

    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
        System.exit(ToolRunner.run(conf,new HBaseMR(),args));
    }

}

4、打包运行

需要使用打包插件,将HBase的依赖jar包都打入到工程jar包里面去

然后执行

yarn jar hbaseStudy-1.0-SNAPSHOT.jar  cn.itcast.hbasemr.HBaseMR

或者我们也可以自己设置我们的环境变量

export HADOOP_HOME=/export/servers/hadoop-2.6.0-cdh5.14.0/

export HBASE_HOME=/export/servers/hbase-1.2.0-cdh5.14.0/

export HADOOP_CLASSPATH=`${HBASE_HOME}/bin/hbase mapredcp`

yarn jar original-hbaseStudy-1.0-SNAPSHOT.jar  cn.itcast.hbasemr.HBaseMR

二、读取HDFS文件,写入到HBase表当中去

1、准备数据文件,并将数据文件上传到HDFS上面去

hdfs dfs -mkdir -p /hbase/input

cd /export/servers/

vim user.txt

0007    zhangsan    18
0008    lisi    25
0009    wangwu    20

2、开发MR程序

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class Hdfs2Hbase extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf(), "hdfs2Hbase");
        job.setJarByClass(Hdfs2Hbase.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job,new Path("hdfs://node01:8020/hbase/input"));
        job.setMapperClass(HdfsMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        TableMapReduceUtil.initTableReducerJob("myuser2",HBaseReducer.class,job);
        job.setNumReduceTasks(1);
        boolean b = job.waitForCompletion(true);

        return b?0:1;
    }


    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
        int run = ToolRunner.run(conf, new Hdfs2Hbase(), args);
        System.exit(run);
    }


    public static class HdfsMapper extends Mapper<LongWritable,Text,Text,NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            context.write(value,NullWritable.get());
        }
    }

    public static class HBaseReducer extends TableReducer<Text,NullWritable,ImmutableBytesWritable> {

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            String[] split = key.toString().split("\t");
            Put put = new Put(Bytes.toBytes(split[0]));
            put.addColumn("f1".getBytes(),"name".getBytes(),split[1].getBytes());
            put.addColumn("f1".getBytes(),"age".getBytes(),Bytes.toBytes(Integer.parseInt(split[2])));
            context.write(new ImmutableBytesWritable(Bytes.toBytes(split[0])),put);
        }
    }

}

三、读取HBase的表数据,然后将数据写入到hdfs上面去

四、通过bulkload的方式批量加载数据到HBase当中去

加载数据到HBase当中去的方式多种多样,我们可以使用HBase的javaAPI或者使用sqoop将我们的数据写入或者导入到HBase当中去,但是这些方式不是慢就是在导入的过程的占用Region资料导致效率低下,我们也可以通过MR的程序,将我们的数据直接转换成HBase的最终存储格式HFile,然后直接load数据到HBase当中去即可

HBase中每张Table在根目录(/HBase)下用一个文件夹存储,Table名为文件夹名,在Table文件夹下每个Region同样用一个文件夹存储,每个Region文件夹下的每个列族也用文件夹存储,而每个列族下存储的就是一些HFile文件,HFile就是HBase数据在HFDS下存储格式,所以HBase存储文件最终在hdfs上面的表现形式就是HFile,如果我们可以直接将数据转换为HFile的格式,那么我们的HBase就可以直接读取加载HFile格式的文件,就可以直接读取

优点:

1.导入过程不占用Region资源

2.能快速导入海量的数据

3.节省内存

HBase数据正常读写流程

 

使用bulkload的方式将我们的数据直接生成HFile格式,然后直接加载到HBase的表当中去

 

需求:将我们hdfs上面的这个路径/hbase/input/user.txt的数据文件,转换成HFile格式,然后load到myuser2这张表里面去

1、开发MR程序

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

public class BulkData extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.quorum","node01:2181,node02:2181,node03:2181");
        System.exit(ToolRunner.run(conf,new BulkLoadData(),args));
    }

    @Override
    public int run(String[] args) throws Exception {

        Job job = Job.getInstance(super.getConf());
        Connection connection = ConnectionFactory.createConnection(super.getConf());
        Table connectionTable = connection.getTable(TableName.valueOf("myuser2"));
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf("myuser2"));
        job.setJarByClass(BulkLoadData.class);
        job.setMapperClass(BulkLoadMap.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        HFileOutputFormat2.configureIncrementalLoad(job,connectionTable,regionLocator);
        FileInputFormat.addInputPath(job,new Path("hdfs://node01:8020/hbase/input/"));
        FileOutputFormat.setOutputPath(job,new Path("hdfs://node01:8020/hbase/output_hfile"));
        return job.waitForCompletion(true)?0:1;

    }

    public static class BulkLoadMap extends Mapper<LongWritable,Text,ImmutableBytesWritable,Put> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String[] split = value.toString().split("\t");
            Put put = new Put(Bytes.toBytes(split[0]));
            put.addColumn("f1".getBytes(),"name".getBytes(),split[1].getBytes());
            put.addColumn("f1".getBytes(),"age".getBytes(),Bytes.toBytes(Integer.valueOf(split[2])));
            context.write(new ImmutableBytesWritable(Bytes.toBytes(split[0])),put);
        }

    }

}

2、将代码打成jar包然后进行运行

yarn jar original-hbaseStudy-1.0-SNAPSHOT.jar  cn.itcast.hbasemr.HBaseLoad

3、开发代码,加载数据

将输出路径下面的HFile文件,加载到我们的hbase表当中去

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class LoadData {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("hbase.zookeeper.property.clientPort","2181");
        conf.set("hbase.zookeeper.quorum","node01,node02,node03");
        Connection connection = ConnectionFactory.createConnection(conf);
        Admin admin = connection.getAdmin();
        Table connectionTable = connection.getTable(TableName.valueOf("myuser"));
        RegionLocator regionLocator = connection.getRegionLocator(TableName.valueOf("myuser"));
        LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
        load.doBulkLoad(new Path("hdfs://node01:8020/hbase/output_hfile"),admin,connectionTable,regionLocator);

    }
}

或者我们也可以通过命令行来进行加载数据

先将hbase的jar包添加到hadoop的classpath路径下

export HBASE_HOME=/export/servers/hbase-1.2.0-cdh5.14.0/

export HADOOP_HOME=/export/servers/hadoop-2.6.0-cdh5.14.0/

export HADOOP_CLASSPATH=`${HBASE_HOME}/bin/hbase mapredcp`

然后执行以下命令,将hbase的HFile直接导入到表myuser2当中来

yarn jar /export/servers/hbase-1.2.0-cdh5.14.0/lib/hbase-server-1.2.0-cdh5.14.0.jar completebulkload /hbase/output_hfile myuser2

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!