Map Source Code - Split
The code below has been split up and rearranged so the train of thought flows smoothly ---- the emphasis is on understanding, not compilable order.
//Submit the job, wait for it to finish, and return its final status
job.waitForCompletion(true);
//Check the current job state
if (state == JobState.DEFINE) {
//Submit the job
submit();
}
//Monitor and print the job's progress
if (verbose) {
monitorAndPrintJob();
}
//Return whether the job completed successfully
return isSuccessful();
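For context, the waitForCompletion call above comes from a user-written driver. The sketch below is a minimal, illustrative driver; it uses the mapper and reducer classes shipped with Hadoop (TokenCounterMapper, IntSumReducer) so it compiles as-is, and the input/output paths come from the command line.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

//Sketch of a minimal driver that reaches job.waitForCompletion(true)
public class WordCountDriver {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "word count");
    job.setJarByClass(WordCountDriver.class);
    job.setMapperClass(TokenCounterMapper.class);   //stands in for a user mapper such as the WordCountMapper referenced later in the trace
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));    //input path from the command line
    FileOutputFormat.setOutputPath(job, new Path(args[1]));  //output path from the command line
    //The entry point of the trace above: submit(), then monitorAndPrintJob(), then isSuccessful()
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}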
//-----------------------------submit();
//Make sure the job is still in the DEFINE state
ensureState(JobState.DEFINE);
//Decide between the MapReduce 1.x and 2.x APIs; 2.x reworked and optimized many of the 1.x methods
setUseNewAPI();
//Connect to the cluster the job will run on
connect();
//Provides a way to access information about the map/reduce cluster.
cluster = new Cluster(getConfiguration());
//Create the job submitter
final JobSubmitter submitter =
getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
//Submit the job to the system for execution
//Internal method for submitting jobs to the system
status = submitter.submitJobInternal(Job.this, cluster)
//Change the job state to RUNNING
state = JobState.RUNNING;
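waitForCompletion(true) is essentially submit() plus monitoring. The job can also be submitted without blocking and polled by hand; a minimal sketch, assuming 'job' is an already configured Job and using an arbitrary polling interval:
//Sketch: non-blocking submission followed by manual polling
job.submit();                                        //DEFINE -> RUNNING, exactly the path traced above
while (!job.isComplete()) {                          //poll the cluster for status
    System.out.printf("map %.0f%% reduce %.0f%%%n",
        job.mapProgress() * 100, job.reduceProgress() * 100);
    Thread.sleep(5000);                              //arbitrary polling interval
}
System.out.println(job.isSuccessful() ? "job succeeded" : "job failed");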
//-----------------------------------submitter.submitJobInternal
//Validate the job's output specification
checkSpecs(job);
//Generate and assign a new JobID
JobID jobId = submitClient.getNewJobID();
job.setJobID(jobId);
//Get the job submission (staging) directory
Path submitJobDir = new Path(jobStagingArea, jobId.toString());
// Create the splits for the job
int maps = writeSplits(job, submitJobDir);
//Set the number of map tasks; it equals the number of input splits
conf.setInt(MRJobConfig.NUM_MAPS, maps);
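Because the number of map tasks equals the number of splits, the knobs a user normally has are the minimum/maximum split sizes (besides the HDFS block size). A hedged sketch of the two alternative adjustments, assuming 'job' is the configured Job:
//Sketch: influencing the split size, and therefore the number of map tasks
//Either raise the minimum above the block size for fewer, larger splits ...
org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setMinInputSplitSize(job, 256L * 1024 * 1024); //256 MB
//... or lower the maximum below the block size for more, smaller splits
org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);  //64 MB
//The corresponding configuration keys are
//mapreduce.input.fileinputformat.split.minsize and mapreduce.input.fileinputformat.split.maxsize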
// ------------------------------Create the splits for the job
int maps = writeSplits(job, submitJobDir);
//Compute the splits using the new API
maps = writeNewSplits(job, jobSubmitDir);
//Create a new instance through the reflection utility class
//Create an object for the given class and initialize it from conf
//With the default configuration the instance created here is org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class
InputFormat<?, ?> input =
ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
//Get the input format class
job.getInputFormatClass()
//Return the class configured for INPUT_FORMAT_CLASS_ATTR if one is set, otherwise the default (TextInputFormat)
return (Class<? extends InputFormat<?,?>>)
conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
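TextInputFormat is only the default; the lookup above returns whatever class was registered on the job. A short sketch, assuming 'job' is the configured Job and using KeyValueTextInputFormat merely as an example of another format shipped with Hadoop:
//Sketch: replacing the default TextInputFormat
job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat.class);
//getInputFormatClass() above would then return KeyValueTextInputFormat instead of TextInputFormat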
//Compute the splits
//Generate the list of files and make them into FileSplits.
List<InputSplit> splits = input.getSplits(job);
//Compute the two bounds on the split size
//Defaults to 1: the larger of the format's minimum (1 byte) and the configured minimum split size (default 1)
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
//Defaults to Long.MAX_VALUE
long maxSize = getMaxSplitSize(job);
//Create a list to hold the splits
List<InputSplit> splits = new ArrayList<InputSplit>();
//List every input file involved in this computation
List<FileStatus> files = listStatus(job);
//Process the files one at a time
for (FileStatus file: files) {
//File path
Path path = file.getPath();
//File length in bytes
long length = file.getLen();
//Block locations of the file
BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
//Size of a single block
long blockSize = file.getBlockSize();
//To make splits smaller than blockSize, set maxSize below blockSize
//To make splits larger than blockSize, set minSize above blockSize
long splitSize = computeSplitSize(blockSize, minSize, maxSize);
//With the default min/max values this is simply blockSize
return Math.max(minSize, Math.min(maxSize, blockSize));
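A standalone sketch of how max(minSize, min(maxSize, blockSize)) behaves for a 128 MB block under the three common configurations:
//Sketch: computeSplitSize(blockSize, minSize, maxSize) = max(minSize, min(maxSize, blockSize))
public class SplitSizeDemo {
  static long computeSplitSize(long blockSize, long minSize, long maxSize) {
    return Math.max(minSize, Math.min(maxSize, blockSize));
  }
  public static void main(String[] args) {
    long blockSize = 128L * 1024 * 1024;                                              //128 MB block
    System.out.println(computeSplitSize(blockSize, 1, Long.MAX_VALUE));               //defaults -> 134217728 (= blockSize)
    System.out.println(computeSplitSize(blockSize, 1, 64L * 1024 * 1024));            //maxSize below blockSize -> 67108864 (64 MB)
    System.out.println(computeSplitSize(blockSize, 256L * 1024 * 1024, Long.MAX_VALUE)); //minSize above blockSize -> 268435456 (256 MB)
  }
}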
//Track the bytes not yet assigned to a split; start with the whole file length
long bytesRemaining = length;
//Keep carving off splits while the remaining bytes exceed the split size by more than 10%
//SPLIT_SLOP = 1.1: if the leftover is at most 10% larger than splitSize, it stays as one final split instead of producing a tiny extra one (see the worked example at the end of this section)
while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
//Find the index of the block containing the start of this split
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
//Build the split object and add it to the list
//makeSplit(path, offset, split length, hosts of the block, hosts holding cached replicas)
splits.add(makeSplit(path, length-bytesRemaining, splitSize,
blkLocations[blkIndex].getHosts(),
blkLocations[blkIndex].getCachedHosts()));
//Recompute the remaining bytes
bytesRemaining -= splitSize;
}
//Whatever remains (at most 1.1 * splitSize bytes) becomes the final split
if (bytesRemaining != 0) {
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
blkLocations[blkIndex].getHosts(),
blkLocations[blkIndex].getCachedHosts()));
}
}
//Record the number of input files
job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
//Return the splits
return splits;
//Convert the list into an array
T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
// sort the splits into order based on size, so that the biggest go first
Arrays.sort(array, new SplitComparator());
//Return the number of splits, which becomes the number of map tasks
return array.length;
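To make the SPLIT_SLOP rule concrete, the sketch below replays the split loop for a hypothetical 260 MB file with a 128 MB block size: 260/128 is about 2.03, which is greater than 1.1, so the first 128 MB split is cut; the remaining 132 MB gives a ratio of about 1.03, which is at most 1.1, so it stays as a single final split and the file yields two splits instead of three.
//Sketch: simulating FileInputFormat's split loop for one file (sizes are hypothetical)
public class SplitSlopDemo {
  private static final double SPLIT_SLOP = 1.1;
  public static void main(String[] args) {
    long length = 260L * 1024 * 1024;     //260 MB file
    long splitSize = 128L * 1024 * 1024;  //split size = block size with the default min/max
    long bytesRemaining = length;
    int splits = 0;
    while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
      System.out.printf("split %d: offset=%d length=%d%n", splits++, length - bytesRemaining, splitSize);
      bytesRemaining -= splitSize;
    }
    if (bytesRemaining != 0) {            //the leftover of at most 1.1 * splitSize becomes the last split
      System.out.printf("split %d: offset=%d length=%d%n", splits++, length - bytesRemaining, bytesRemaining);
    }
    System.out.println("total splits: " + splits);   //prints 2 for this file
  }
}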
Map Source Code - MapTask
//-----------------------------------org.apache.hadoop.mapred.MapTask
//Check whether this is a map task
if (isMapTask()) {
//If there are no reduce tasks, the map output is not sorted
if (conf.getNumReduceTasks() == 0)
}
//Whether to process the data with the new API
boolean useNewApi = job.getUseNewMapper();
//Initialize the task
initialize(job, getJobID(), reporter, useNewApi);
// check if it is a cleanupJobTask
//Run the mapper using the new API
runNewMapper(job, splitMetaInfo, umbilical, reporter);
//make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
getTaskID(),
reporter);
// make a mapper -------> com.xxxxx.gy.WordCountMapper
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
(org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
// make the input format
//-------> org.apache.hadoop.mapreduce.lib.input.TextInputFormat
//-------> org.apache.hadoop.mapreduce.lib.input.FileInputFormat
org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
(org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
// rebuild the input split
// Get the split for this task: its location and start offset
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),splitIndex.getStartOffset());
//Create a record reader
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
new NewTrackingRecordReader<INKEY,INVALUE>(split, inputFormat, reporter, taskContext);
//org.apache.hadoop.mapreduce.RecordReader
//org.apache.hadoop.mapreduce.lib.input.LineRecordReader
this.real = inputFormat.createRecordReader(split, taskContext);
return new LineRecordReader(recordDelimiterBytes);
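The recordDelimiterBytes passed to LineRecordReader come from the configuration; by default records are delimited by newlines, but TextInputFormat honours a custom delimiter. A one-line sketch, assuming 'job' is the configured Job and using Ctrl-A purely as an example:
//Sketch: making TextInputFormat's LineRecordReader split records on a custom delimiter
job.getConfiguration().set("textinputformat.record.delimiter", "\u0001");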
//Create a record writer
org.apache.hadoop.mapreduce.RecordWriter output = null;
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
//MapContext: the context object of the current map task, tying together everything created above
org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
mapContext = new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
input, output,
committer,
reporter, split);
//Wrapper class around the MapContext
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
mapperContext = new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
mapContext);
//Initialize the record reader; the lines below are from LineRecordReader.initialize
input.initialize(split, mapperContext);
//The split to read
FileSplit split = (FileSplit) genericSplit;
//Maximum allowed length of a single line
this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
//Start offset of the split
start = split.getStart();
//End offset of the split
end = start + split.getLength();
//Path of the file
final Path file = split.getPath();
//Open an input stream for the file
final FileSystem fs = file.getFileSystem(job);
fileIn = fs.open(file);
//Seek to the split's start offset
fileIn.seek(start);
in = new UncompressedSplitLineReader(fileIn, job, this.recordDelimiterBytes, split.getLength());
filePosition = fileIn;
//Only the first split has start == 0; every other split has a non-zero start
//Splits after the first skip their (possibly partial) first line, because the previous split reads one line past its own end
if (start != 0) {
start += in.readLine(new Text(), 0, maxBytesToConsume(start));
}
this.pos = start;
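The start += in.readLine(new Text(), ...) trick above is what keeps records intact across split boundaries: a non-first split throws away everything up to the first newline, and the previous split compensates by reading one line past its end. A small standalone sketch using org.apache.hadoop.util.LineReader; the sample data and the split boundary at byte 8 are made up:
import java.io.ByteArrayInputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

//Sketch: why a non-first split discards its first (possibly partial) line
public class SkipFirstLineDemo {
  public static void main(String[] args) throws Exception {
    byte[] data = "hello world\nfoo bar\nbaz\n".getBytes("UTF-8");
    long start = 8;   //hypothetical split boundary in the middle of "hello world"
    LineReader in = new LineReader(new ByteArrayInputStream(data, (int) start, data.length - (int) start));
    //Discard the partial line "rld"; readLine returns the number of bytes consumed
    start += in.readLine(new Text());
    Text line = new Text();
    while (in.readLine(line) > 0) {   //the split now begins cleanly at "foo bar"
      System.out.println(line);
    }
    in.close();
  }
}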
//Run the Mapper
mapper.run(mapperContext);
//Check whether there is another key/value pair
//This ultimately delegates to LineRecordReader's nextKeyValue()
while (context.nextKeyValue()) {
//key holds the byte offset of the current line
//value holds the content of the line
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
//Override these if you need to do extra work before the first record or after the last one
setup(context);
cleanup(context);
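The mapper class resolved earlier (com.xxxxx.gy.WordCountMapper in the original trace) is user code and is not part of the Hadoop source; a typical word-count mapper of that shape, including the setup/cleanup hooks just mentioned, would look roughly like this:
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

//Sketch of a word-count style mapper; the input key/value types match what LineRecordReader produces
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
  private static final IntWritable ONE = new IntWritable(1);
  private final Text word = new Text();

  @Override
  protected void setup(Context context) {
    //called once before the first record; open resources, load side data, etc.
  }

  @Override
  protected void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    //key = byte offset of the line, value = the line itself
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      context.write(word, ONE);
    }
  }

  @Override
  protected void cleanup(Context context) {
    //called once after the last record; flush or close resources
  }
}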
//----------------reader.nextKeyValue()
//If key is null, create a LongWritable for it
if (key == null) {
key = new LongWritable();
}
//Set the key to the current position (pos); this is 0 only at the very start of the file
key.set(pos);
//Create an empty value object that will later hold one line of data
if (value == null) {
value = new Text();
}
//Read lines while still inside the split, plus one extra line past the split boundary if required
while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
//If this is the very beginning of the file
if (pos == 0) {
//Skip the UTF byte order mark (BOM) if present, then read a line
newSize = skipUtfByteOrderMark();
} else {
//maxLineLength --> the maximum number of bytes to read for one line (Integer.MAX_VALUE by default)
//newSize --> the number of bytes read this time (the length of the line)
//Read one line from the InputStream into the given Text
newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
//Advance the offset
pos += newSize;
}
//A complete line was read (or the end of the input was reached)
if ((newSize == 0) || (newSize < maxLineLength)) {
break;
}
}
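LineRecordReader can also be exercised outside a running job, which is a convenient way to watch nextKeyValue() advance pos line by line. A hedged sketch of such a harness; the local file path is a placeholder and hadoop-common plus hadoop-mapreduce-client-core are assumed on the classpath:
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

//Sketch: driving LineRecordReader by hand to observe the (offset, line) pairs it produces
public class LineRecordReaderDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();                 //defaults to the local file system
    File f = new File("/tmp/demo.txt");                       //placeholder input file
    FileSplit split = new FileSplit(new Path(f.toURI()), 0, f.length(), null);
    LineRecordReader reader = new LineRecordReader();
    reader.initialize(split, new TaskAttemptContextImpl(conf, new TaskAttemptID()));
    while (reader.nextKeyValue()) {                           //the same loop Mapper.run() performs via the context
      System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
    }
    reader.close();
  }
}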
Source: CSDN
Author: BF-LoneSilverWind
Link: https://blog.csdn.net/digua930126/article/details/103638169