Interpreting output from mahout clusterdumper

前端未结

关注

 4  2075

面向向阳花 2021-02-10 03:24

I ran a clustering test on crawled pages (more than 25K docs; personal data set). I've done a clusterdump:

$MAHOUT_HOME/bin/mahout clusterdump --seqFileDir ou


      
      
        
          4条回答        

        
                    
            
            
                         
                
              
              
                
                   悲&欢浪女
                                             
                
                
                (楼主)
            
              
              
                2021-02-10 04:14
              

            
            
                        

By default, kmeans clustering uses WeightedVector, which does not include the data point name. So you will want to build the sequence files yourself using NamedVector. There is a one-to-one correspondence between the number of seq files and the mapping tasks, so if your mapping capacity is 12, you want to chop your data into 12 pieces when making the seq files.
NamedVector:

vector = new NamedVector(new SequentialAccessSparseVector(Cardinality),arrField[0]);

Basically you need to download the clusteredPoints from your HDFS system and write your own code to output the results. Here is the code that I wrote to output the cluster point membership. 

import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.NamedVector;

/**
 * Writes cluster membership and cluster sizes from sequence files found in a local directory.
 *
 * <p>Every file in the input directory whose name contains {@code part-m} is read as a
 * sequence file of {@link IntWritable} keys and {@link WeightedVectorWritable} values. The key
 * is used as the cluster id and the vector's name as the point id.
 */
public class ClusterOutput {

    /**
     * Entry point.
     *
     * @param args {@code args[0]} directory holding the {@code part-m*} sequence files;
     *             {@code args[1]} output file receiving one {@code name<TAB>clusterId} line per
     *             point; {@code args[2]} output file receiving one {@code clusterId count} line
     *             per cluster
     */
    public static void main(String[] args) {
        if (args.length < 3) {
            System.err.println(
                    "Usage: ClusterOutput <pointsDir> <membershipFile> <clusterSizeFile>");
            return;
        }

        File pointsFolder = new File(args[0]);
        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] files = pointsFolder.listFiles();
        if (files == null) {
            System.err.println("Not a readable directory: " + pointsFolder);
            return;
        }

        try {
            Configuration conf = new Configuration();
            // NOTE(review): local absolute paths are opened through this FileSystem, which
            // presumably requires the default filesystem to be the local one - confirm.
            FileSystem fs = FileSystem.get(conf);
            Map<String, Integer> clusterSizes = new HashMap<String, Integer>(5000);

            try (BufferedWriter membership =
                    new BufferedWriter(new FileWriter(new File(args[1])))) {
                for (File file : files) {
                    if (!file.getName().contains("part-m")) {
                        continue;
                    }
                    appendMemberships(fs, conf, file, membership, clusterSizes);
                }
            }

            writeClusterSizes(new File(args[2]), clusterSizes);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Writes one membership line per record of {@code file} and tallies records per cluster id. */
    private static void appendMemberships(
            FileSystem fs,
            Configuration conf,
            File file,
            BufferedWriter out,
            Map<String, Integer> clusterSizes) throws IOException {
        // try-with-resources closes the reader even if a read or write fails part-way.
        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(fs, new Path(file.getAbsolutePath()), conf)) {
            IntWritable key = new IntWritable();
            WeightedVectorWritable value = new WeightedVectorWritable();
            while (reader.next(key, value)) {
                // The cast fails with ClassCastException if the stored vector is not a
                // NamedVector, i.e. the input was not built with named vectors.
                NamedVector vector = (NamedVector) value.getVector();
                String clusterId = key.toString();
                out.write(vector.getName() + "\t" + clusterId + "\n");

                Integer seen = clusterSizes.get(clusterId);
                clusterSizes.put(clusterId, seen == null ? 1 : seen + 1);
            }
        }
    }

    /** Writes one {@code clusterId count} line per entry of {@code clusterSizes} to {@code target}. */
    private static void writeClusterSizes(File target, Map<String, Integer> clusterSizes)
            throws IOException {
        try (BufferedWriter out = new BufferedWriter(new FileWriter(target))) {
            for (Map.Entry<String, Integer> entry : clusterSizes.entrySet()) {
                out.write(entry.getKey() + " " + entry.getValue() + "\n");
            }
        }
    }
}


    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它4个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复