2.3 MapReduce Source Code Analysis 01


Map Source Code - Split


The code below has been pulled apart and rearranged so the flow of calls is easy to follow; the point is understanding, not a line-for-line copy of the source.
// Submit the job, wait for it to finish, and return the final status
job.waitForCompletion(true);
// Check the current job state
if (state == JobState.DEFINE) {
    // Submit the job
    submit();
}
// Monitor the running job and print its progress
if (verbose) {
    monitorAndPrintJob();
}
// Return whether the job succeeded
return isSuccessful();
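
For orientation, this call chain begins in the user's driver code. A minimal driver sketch is shown below; the class names (WordCountDriver, WordCountMapper, WordCountReducer) are hypothetical placeholders matching the word-count example referenced later in this walkthrough:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver, shown only to anchor where waitForCompletion() is called from
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCountMapper.class);      // hypothetical mapper class
        job.setReducerClass(WordCountReducer.class);    // hypothetical reducer class
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Entry point of everything analyzed in this section
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}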

//----------------------------- submit()
// Make sure the job is still in the DEFINE state
ensureState(JobState.DEFINE);
// Choose between the MapReduce 1.x and 2.x APIs; the 2.x release reworked and
// optimized many of the 1.x methods
setUseNewAPI();
// Connect to the cluster the job will run on
connect();
// Provides a way to access information about the map/reduce cluster.
cluster = new Cluster(getConfiguration());
// Create the job submitter
final JobSubmitter submitter =
    getJobSubmitter(cluster.getFileSystem(), cluster.getClient());
// Submit the job to the system for execution
// Internal method for submitting jobs to the system
status = submitter.submitJobInternal(Job.this, cluster);
// Switch the job state to RUNNING
state = JobState.RUNNING;
//----------------------------------- submitter.submitJobInternal
// Validate the job's output specification
checkSpecs(job);
// Obtain a new JobId and set it on the job
JobID jobId = submitClient.getNewJobID();
job.setJobID(jobId);
// Build the job's staging (submission) directory
Path submitJobDir = new Path(jobStagingArea, jobId.toString());
// Create the splits for the job
int maps = writeSplits(job, submitJobDir);
// Set the number of map tasks; it equals the number of splits
conf.setInt(MRJobConfig.NUM_MAPS, maps);
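
Because the number of map tasks equals the number of splits, the practical way to change it is to change the split size. A small driver-side sketch, assuming the standard FileInputFormat setters (they back mapreduce.input.fileinputformat.split.minsize/maxsize):

// In the driver, after the Job has been created:
// a smaller maxSize produces more, smaller splits and therefore more map tasks;
// a minSize larger than the block size has the opposite effect.
FileInputFormat.setMinInputSplitSize(job, 1L);
FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);   // 64 MB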

// ------------------------------ Create the splits for the job
int maps = writeSplits(job, submitJobDir);
// Use the new API to compute the splits
maps = writeNewSplits(job, jobSubmitDir);
// Create the InputFormat instance through the reflection utility
// Create an object for the given class and initialize it from conf
// By default the reflected object is an org.apache.hadoop.mapreduce.lib.input.TextInputFormat
InputFormat<?, ?> input =
    ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
// Resolve the input format class
job.getInputFormatClass()
    // Return the configured class if one is set, otherwise the default
    return (Class<? extends InputFormat<?,?>>)
        conf.getClass(INPUT_FORMAT_CLASS_ATTR, TextInputFormat.class);
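
If nothing is configured, TextInputFormat is used; a job can override the default in the driver, for example with another stock implementation such as KeyValueTextInputFormat (shown here purely as an illustration):

// Sets INPUT_FORMAT_CLASS_ATTR, so getInputFormatClass() no longer falls back to TextInputFormat
job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat.class);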
// Compute the splits
// Generate the list of files and make them into FileSplits.
List<InputSplit> splits = input.getSplits(job);
// Compute the two bounds (the format's minimum split size is 1 byte, and the
// configured minimum split size also defaults to 1)
// 1
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
// Long.MAX_VALUE by default
long maxSize = getMaxSplitSize(job);
// Create a list to hold the splits
List<InputSplit> splits = new ArrayList<InputSplit>();
// List every file that this job will process
List<FileStatus> files = listStatus(job);
// Process the files one at a time
for (FileStatus file: files) {
    // File path
    Path path = file.getPath();
    // File length in bytes
    long length = file.getLen();
    // Block locations of the file
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    // Size of a single block
    long blockSize = file.getBlockSize();
    // To make splits smaller than the block size, set maxSize below blockSize;
    // to make splits larger than the block size, set minSize above blockSize
    long splitSize = computeSplitSize(blockSize, minSize, maxSize);
    // (body of computeSplitSize; with default settings this is simply blockSize)
    return Math.max(minSize, Math.min(maxSize, blockSize));
    // Remaining bytes to split, initialized to the file length
    long bytesRemaining = length;
    // Keep cutting splits while the remainder is large enough
    // SPLIT_SLOP = 1.1: if the tail is at most 10% larger than splitSize,
    // it is kept as one final split instead of producing a tiny extra split
    while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
        // Index of the block that contains the start of this split
        int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
        // Build the split object and add it to the list
        // makeSplit(path, offset, split length, hosts, cached hosts)
        splits.add(makeSplit(path, length-bytesRemaining, splitSize,
            blkLocations[blkIndex].getHosts(),
            blkLocations[blkIndex].getCachedHosts()));
        // Update the remaining byte count
        bytesRemaining -= splitSize;
    }
    // The tail (anywhere in (0, 1.1] x splitSize) becomes one last split
    if (bytesRemaining != 0) {
        int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
        splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
            blkLocations[blkIndex].getHosts(),
            blkLocations[blkIndex].getCachedHosts()));
    }
}
// Record the number of input files
job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
// Return the split list
return splits;
// Back in writeNewSplits: convert the list into an array
T[] array = (T[]) splits.toArray(new InputSplit[splits.size()]);
// sort the splits into order based on size, so that the biggest go first
Arrays.sort(array, new SplitComparator());
// Return the number of splits; this becomes the number of map tasks
return array.length;
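
To make the arithmetic concrete, here is a tiny standalone sketch (not Hadoop code, just the same math) that mirrors computeSplitSize() and the SPLIT_SLOP loop for a hypothetical 300 MB file stored in 128 MB blocks with default min/max settings:

// Standalone illustration of the split math
public class SplitMathDemo {
    static final double SPLIT_SLOP = 1.1;

    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    public static void main(String[] args) {
        long length = 300L * 1024 * 1024;        // 300 MB file
        long blockSize = 128L * 1024 * 1024;     // 128 MB blocks
        long minSize = 1L;                       // default
        long maxSize = Long.MAX_VALUE;           // default
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);  // = 128 MB

        long bytesRemaining = length;
        int splits = 0;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            splits++;                            // one full 128 MB split
            bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
            splits++;                            // final 44 MB tail split
        }
        System.out.println(splits);              // prints 3 (128 MB + 128 MB + 44 MB)
    }
}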



Map Source Code - MapTask


//----------------------------------- org.apache.hadoop.mapred.MapTask
// Check whether this is a map task
if (isMapTask()) {
    // If the job has no reduce tasks, the map output is not sorted
    if (conf.getNumReduceTasks() == 0)
}
// Use the new API to process the data
boolean useNewApi = job.getUseNewMapper();
// Initialize the task
initialize(job, getJobID(), reporter, useNewApi);
// check if it is a cleanupJobTask
// Run the mapper through the new API
runNewMapper(job, splitMetaInfo, umbilical, reporter);
//make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
                 new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, 
                                                                             getTaskID(),
                                                                             reporter);
// make a mapper -------> com.xxxxx.gy.WordCountMapper
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
      (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
      ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
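
taskContext.getMapperClass() resolves to the user's mapper class (the com.xxxxx.gy.WordCountMapper referenced above). A hypothetical sketch of such a mapper, just to anchor the generic parameters INKEY/INVALUE/OUTKEY/OUTVALUE (line offset, line text, word, count):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical word-count mapper: INKEY=LongWritable, INVALUE=Text, OUTKEY=Text, OUTVALUE=IntWritable
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // key is the byte offset of the line, value is the line itself
        for (String token : value.toString().split("\\s+")) {
            if (!token.isEmpty()) {
                word.set(token);
                context.write(word, ONE);
            }
        }
    }
}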
// make the input format

//-------> org.apache.hadoop.mapreduce.lib.input.TextInputFormat
//-------> org.apache.hadoop.mapreduce.lib.input.FileInputFormat
org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
  (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
  ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
// rebuild the input split
// Rebuild the current split object: its storage location and start offset
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),splitIndex.getStartOffset());

// Create a record reader
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
            new NewTrackingRecordReader<INKEY,INVALUE>(split, inputFormat, reporter, taskContext);
//org.apache.hadoop.mapreduce.RecordReader
//org.apache.hadoop.mapreduce.lib.input.LineRecordReader
this.real = inputFormat.createRecordReader(split, taskContext);
return new LineRecordReader(recordDelimiterBytes);
// Create a record writer
org.apache.hadoop.mapreduce.RecordWriter output = null;
output = new NewOutputCollector(taskContext, job, umbilical, reporter);

// MapContext: the context object of the current map task; it ties together everything created above
org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> 
   mapContext = new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(), 
         input, output, 
         committer, 
         reporter, split);
// A wrapper class around the MapContext
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context 
       mapperContext = new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
             mapContext);

// Initialize the record reader (LineRecordReader for TextInputFormat)
input.initialize(split, mapperContext);
// Get the split
FileSplit split = (FileSplit) genericSplit;
// Maximum allowed length of a single line
this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
// Start offset of the split
start = split.getStart();
// End offset of the split
end = start + split.getLength();
// Path of the underlying file
final Path file = split.getPath();
// Open an input stream on the file
final FileSystem fs = file.getFileSystem(job);
fileIn = fs.open(file);
// Seek the stream to the split's start offset
fileIn.seek(start);
in = new UncompressedSplitLineReader(fileIn, job, this.recordDelimiterBytes, split.getLength());
filePosition = fileIn;

// start == 0 only for the first split; every other split starts mid-file.
// From the second split onward, skip the (possibly partial) first line,
// because the previous split's reader has already consumed it
if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
}
this.pos = start;
// Start running the Mapper
mapper.run(mapperContext);
// Check whether there is another key/value pair
// This actually delegates to LineRecordReader.nextKeyValue()
while (context.nextKeyValue()) {
    // key holds the byte offset of the current line
    // value holds the content of the current line
    map(context.getCurrentKey(), context.getCurrentValue(), context);
}
// setup() and cleanup() are hooks for work that must run at the start
// or at the end of the task
setup(context);
cleanup(context);
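
setup() runs once before the first map() call and cleanup() once after the last one. A hypothetical sketch of a mapper that uses both hooks (the stop-word logic is purely illustrative):

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper that builds state in setup() and reports it in cleanup()
public class StopWordFilterMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Set<String> stopWords = new HashSet<>();
    private long filtered = 0;

    @Override
    protected void setup(Context context) {
        // Runs once per map task, before the nextKeyValue() loop
        stopWords.add("the");
        stopWords.add("a");
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String token : value.toString().split("\\s+")) {
            if (stopWords.contains(token)) {
                filtered++;
            } else if (!token.isEmpty()) {
                context.write(new Text(token), new IntWritable(1));
            }
        }
    }

    @Override
    protected void cleanup(Context context) {
        // Runs once per map task, after the last map() call
        System.out.println("Filtered stop words: " + filtered);
    }
}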

//---------------- reader.nextKeyValue()
// If key is null, create a LongWritable for it
if (key == null) {
    key = new LongWritable();
}
// Set the key to the current offset (initially 0)
key.set(pos);
// Create the value object; it will hold one line of data
if (value == null) {
    value = new Text();
}
// Keep reading while still inside the split (plus, if needed, one extra
// record past the split boundary)
while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
    // First read of the file
    if (pos == 0) {
        // Skip the UTF byte-order mark, then read one line
        newSize = skipUtfByteOrderMark();
    } else {
        // maxLineLength: the maximum bytes to read for one line (Integer.MAX_VALUE)
        // newSize: the number of bytes actually read (the length of this line)
        // Read one line from the InputStream into the given Text
        newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
        // Advance the offset
        pos += newSize;
    }
    // A complete line has been read (or the end of the input was reached)
    if ((newSize == 0) || (newSize < maxLineLength)) {
        break;
    }
}