Sharing data between master and reduce tasks

Asked 2021-01-24 18:42

I need to perform aggregation using the results from all the reduce tasks. Basically, each reduce task finds a sum and a count of values. I need to add all the sums and counts across the reduce tasks to compute a global result.

2 Answers
  • 2021-01-24 19:10

    Found the solution: I used counters.

    http://diveintodata.org/2011/03/15/an-example-of-hadoop-mapreduce-counter/

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Reducer;

    public class FlightData {

        // enum naming the counters updated by the reducers; the framework
        // aggregates counter values across all map and reduce tasks
        public static enum FlightCounters {
            FLIGHT_COUNT,
            FLIGHT_DELAY;
        }

        public static class MyReducer
                extends Reducer<Text, Text, Text, IntWritable> {

            @Override
            public void reduce(Text key, Iterable<Text> values,
                    Context context) throws IOException, InterruptedException {

                // origin[] and dest[] hold the parsed record fields; how they
                // are extracted from the values is elided in this snippet
                float delay1 = Float.parseFloat(origin[5]);
                float delay2 = Float.parseFloat(dest[5]);

                // count this flight and accumulate its total delay globally
                context.getCounter(FlightCounters.FLIGHT_COUNT).increment(1);
                context.getCounter(FlightCounters.FLIGHT_DELAY)
                        .increment((long) (delay1 + delay2));
            }
        }

        public static void main(String[] args) throws Exception {
            float flightCount, flightDelay;
            Job job = Job.getInstance(new Configuration());
            // input/output paths and mapper/reducer setup elided here
            job.waitForCompletion(true);

            // get the final results accumulated in the counters by all
            // map and reduce tasks
            flightCount = job.getCounters()
                    .findCounter(FlightCounters.FLIGHT_COUNT).getValue();
            flightDelay = job.getCounters()
                    .findCounter(FlightCounters.FLIGHT_DELAY).getValue();
        }
    }
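
    As a follow-up not in the original answer: once both counter values are
    back in the driver, they can be combined, for example into a mean delay
    per flight. A minimal sketch, assuming flightCount is non-zero:

    // hypothetical post-processing in main(), after waitForCompletion()
    float averageDelay = flightDelay / flightCount;
    System.out.println("Average delay: " + averageDelay);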

  • 2021-01-24 19:11

    Override run() in both the mapper and the reducer using the new org.apache.hadoop.mapreduce API. From run() you can emit the accumulated sum/count of each mapper or reducer once all of its input has been processed.

    You would also need to limit the job to a single reducer so that it computes one global sum over all the partial sums generated by the mappers (see the sketch just below and the full example after it).
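
    As an aside (an assumption about the Job API, not part of the original
    code below), the single-reducer constraint can also be set directly on
    the Job object instead of through raw configuration keys; a minimal
    sketch:

    // equivalent to conf.setInt("mapred.reduce.tasks", 1)
    Job job = Job.getInstance(conf, "Aggregation Example");
    job.setNumReduceTasks(1); // one reducer sees every partial sum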

    See the full example below for more clarity:

    import java.io.IOException;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    
    public class AggregationExample extends Configured implements Tool {
    
        /**
         * Mapper that accumulates a partial sum of its input values and
         * emits that sum exactly once, from run(), after all input is read.
         */
        public static class MapJob extends Mapper<LongWritable, Text, Text, Text> {
    
            private Text outputKey = new Text();
            private Text outputValue = new Text();
            private double sum;
    
            @Override
            public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

                try {
                    // say that you need to sum up the value part
                    sum += Double.valueOf(value.toString());
                } catch (NumberFormatException e) {
                    // skip records whose value is not a plain number
                }
            }
    
            @Override
            public void run(Context context) throws IOException, InterruptedException {
    
                setup(context);
                while (context.nextKeyValue()) {
                    map(context.getCurrentKey(), context.getCurrentValue(), context);
                }
    
                // emit out the sum per mapper; Text.set() takes a String
                outputKey.set(String.valueOf(sum));
                context.write(outputKey, outputValue); // the outputValue is deliberately empty
                cleanup(context);
    
            }
        }
    
        /**
         * Reducer that adds up the partial sums emitted by the mappers and
         * writes the single global sum from run().
         */
        public static class ReduceJob extends Reducer<Text, Text, Text, Text> {
    
            private Text outputKey = new Text();
            private Text outputValue = new Text();
            private double sum;
    
            @Override
            protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException,
                    InterruptedException {

                // each mapper emitted its partial sum as the key; iterate the
                // values so that identical partial sums arriving from
                // different mappers are each added once
                for (Text ignored : values) {
                    sum += Double.valueOf(key.toString());
                }
            }
    
            @Override
            public void run(Context context) throws IOException, InterruptedException {
    
                setup(context);
                while (context.nextKey()) {
                    reduce(context.getCurrentKey(), context.getValues(), context);
                }
    
                // emit out the single global sum; Text.set() takes a String
                outputKey.set(String.valueOf(sum));
                context.write(outputKey, outputValue);
                cleanup(context);
            }
        }
    
        @Override
        public int run(String[] args) throws Exception {
    
            try {
                Configuration conf = getConf();
    
                // output key and value separator is empty as in final output only
                // key is emitted and value is empty
                conf.set("mapred.textoutputformat.separator", "");
    
                // Configure the job to use exactly one reducer, as we need
                // a single global sum computed from all the inputs
                conf.setInt("mapred.tasktracker.reduce.tasks.maximum", 1);
                conf.setInt("mapred.reduce.tasks", 1);
    
                Job job = new Job(conf);
    
                job.setJarByClass(AggregationExample.class);
                job.setJobName("Aggregation Example");
    
                job.setMapperClass(MapJob.class);
                job.setReducerClass(ReduceJob.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);
    
                job.setInputFormatClass(TextInputFormat.class);
                job.setOutputFormatClass(TextOutputFormat.class);
                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(Text.class);
                FileInputFormat.setInputPaths(job, args[0]);
                FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
                boolean success = job.waitForCompletion(true);
                return success ? 0 : 1;
            } catch (Exception e) {
                e.printStackTrace();
                return 1;
            }
    
        }
    
        public static void main(String[] args) throws Exception {
    
            if (args.length < 2) {
                System.out
                        .println("Usage: AggregationExample <comma sparated list of input directories> <output dir>");
                System.exit(-1);
            }
    
            int result = ToolRunner.run(new AggregationExample(), args);
            System.exit(result);
        }
    
    }
    

    You should be able to map this approach directly onto your problem.
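
    For completeness, a hypothetical invocation of the finished job (the jar
    name is an assumption, not from the original answer):

    hadoop jar aggregation-example.jar AggregationExample /input/dir /output/dir

    With one reducer, the single line written to /output/dir/part-r-00000
    would then contain the global sum.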
