// NOTE: Please create the required input files and directories yourself before running!
package com.hadoop.hdfs;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IOUtils;
/**
 * Merges small local text files into larger files on HDFS.
 *
 * <p>Every entry found directly under the local input directory (expected to be a
 * directory named by date, e.g. {@code 2019-10-31}) is merged into a single HDFS file
 * named after that entry with the dashes removed (e.g. {@code 20191031.txt}).
 */
public class MergeSmallFilesToHDFS {

    /** HDFS address used to obtain the file system; change it to your own cluster address. */
    private static final String HDFS_URI = "hdfs://master:9000";

    /** Glob selecting the entries under the local data set directory; change it to your own path. */
    private static final String LOCAL_INPUT_GLOB = "E://Hadoop/73/*";

    /** HDFS directory that receives the merged files; change it to your own HDFS directory. */
    private static final String HDFS_OUTPUT_DIR = "hdfs://master:9000/20191031/";

    /** Buffer size, in bytes, used when copying a local file into the HDFS output stream. */
    private static final int COPY_BUFFER_SIZE = 4096;

    private static FileSystem fs = null;
    private static FileSystem local = null;

    /**
     * Program entry point; runs {@link #list()}.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if reading a local file or writing to HDFS fails
     * @throws URISyntaxException if the configured HDFS address is not a valid URI
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        list();
    }

    /**
     * Merges the data set and uploads the merged files to HDFS.
     *
     * <p>Entries whose path ends in {@code svn} are skipped; for every remaining entry the
     * files whose path ends in {@code txt} are concatenated into one HDFS file.
     *
     * @throws IOException if reading a local file or writing to HDFS fails
     * @throws URISyntaxException if the configured HDFS address is not a valid URI
     */
    public static void list() throws IOException, URISyntaxException {
        // Read the Hadoop file system configuration.
        Configuration conf = new Configuration();
        // File system handles: the remote HDFS and the local file system.
        fs = FileSystem.get(new URI(HDFS_URI), conf);
        local = FileSystem.getLocal(conf);

        // List everything under the input directory, filtering out svn entries.
        FileStatus[] dirStatuses =
                local.globStatus(new Path(LOCAL_INPUT_GLOB), new RegexExcludePathFilter("^.*svn$"));
        if (dirStatuses == null) {
            // Nothing matched (globStatus may return null), so there is nothing to merge.
            return;
        }
        for (Path dir : FileUtil.stat2Paths(dirStatuses)) {
            mergeDirectory(dir);
        }
    }

    /** Concatenates all .txt files directly under {@code dir} into one HDFS file. */
    private static void mergeDirectory(Path dir) throws IOException {
        // Directory name such as 2019-10-31 becomes the merged file name 20191031.
        String fileName = dir.getName().replace("-", "");

        // Only accept the .txt files under the date directory.
        FileStatus[] txtStatuses =
                local.globStatus(new Path(dir + "/*"), new RegexAcceptPathFilter("^.*txt$"));
        Path[] sources = (txtStatuses == null) ? new Path[0] : FileUtil.stat2Paths(txtStatuses);

        Path target = new Path(HDFS_OUTPUT_DIR + fileName + ".txt");
        System.out.println("合并后的文件名称:" + fileName + ".txt");

        // try-with-resources guarantees both streams are closed even when a copy fails.
        try (FSDataOutputStream out = fs.create(target)) {
            for (Path source : sources) {
                try (FSDataInputStream in = local.open(source)) {
                    // 'false': do not let copyBytes close the streams; the output stream
                    // must stay open for the remaining source files.
                    IOUtils.copyBytes(in, out, COPY_BUFFER_SIZE, false);
                }
            }
        }
    }

    /**
     * Path filter that rejects every path whose full string form matches a regular
     * expression and accepts all others.
     */
    public static class RegexExcludePathFilter implements PathFilter {
        // Compiled once so the expression is not re-parsed for every candidate path.
        private final Pattern pattern;

        /**
         * @param regex regular expression describing the paths to exclude
         */
        public RegexExcludePathFilter(String regex) {
            this.pattern = Pattern.compile(regex);
        }

        /**
         * @param path candidate path
         * @return {@code true} when the whole path string does NOT match the expression
         */
        @Override
        public boolean accept(Path path) {
            return !pattern.matcher(path.toString()).matches();
        }
    }

    /**
     * Path filter that accepts only the paths whose full string form matches a regular
     * expression.
     */
    public static class RegexAcceptPathFilter implements PathFilter {
        // Compiled once so the expression is not re-parsed for every candidate path.
        private final Pattern pattern;

        /**
         * @param regex regular expression describing the paths to accept
         */
        public RegexAcceptPathFilter(String regex) {
            this.pattern = Pattern.compile(regex);
        }

        /**
         * @param path candidate path
         * @return {@code true} when the whole path string matches the expression
         */
        @Override
        public boolean accept(Path path) {
            return pattern.matcher(path.toString()).matches();
        }
    }
}
// Source: 51CTO
// Author: wx5da03a3bd2999
// Link: https://blog.51cto.com/14572091/2446947