Which commit has this blob?

前端 未结 7 2084
别那么骄傲
别那么骄傲 2020-11-22 02:45

Given the hash of a blob, is there a way to get a list of commits that have this blob in their tree?

7条回答
  •  名媛妹妹
    2020-11-22 03:13

    So... I needed to find all files over a given limit in a repo over 8GB in size, with over 108,000 revisions. I adapted Aristotle's perl script along with a ruby script I wrote to reach this complete solution.

    First, run git gc. This ensures all objects are in packfiles; the script below does not scan objects that are not in a packfile.

    Next, run this script to locate all blobs over CUTOFF_SIZE bytes. Capture the output to a file such as "large-blobs.log".

    #!/usr/bin/env ruby
    #
    # Lists every blob, tree and commit object stored in the packfiles of the
    # current git repository whose size exceeds CUTOFF_SIZE bytes, largest first.
    # Objects that are not in a packfile are not examined.

    require 'log4r'

    # The output of git verify-pack -v is:
    # SHA1 type size size-in-packfile offset-in-packfile depth base-SHA1
    #
    # Glob, relative to the repository top level, matching every packfile.
    GIT_PACKS_RELATIVE_PATH=File.join('.git', 'objects', 'pack', '*.pack')

    # 10MB cutoff
    CUTOFF_SIZE=1024*1024*10
    #CUTOFF_SIZE=1024

    begin

      include Log4r
      log = Logger.new 'git-find-large-objects'
      log.level = INFO
      log.outputters = Outputter.stdout

      git_dir = %x[ git rev-parse --show-toplevel ].chomp

      if git_dir.empty?
        log.fatal "ERROR: must be run in a git repository"
        exit 1
      end

      log.debug "Git Dir: '#{git_dir}'"

      pack_files = Dir[File.join(git_dir, GIT_PACKS_RELATIVE_PATH)]
      log.debug "Git Packs: #{pack_files.to_s}"

      # Object types that are counted; every other verify-pack line (summary
      # lines, other object types, blank lines) is skipped.
      object_types = %w[blob tree commit]

      total_count = 0
      counted_objects = 0
      large_objects = []

      if pack_files.empty?
        # Without this guard git verify-pack would be started with no pack
        # arguments at all.
        log.warn "No pack files found under '#{git_dir}'"
      else
        # For details on this IO, see http://stackoverflow.com/questions/1154846/continuously-read-from-stdout-of-external-process-in-ruby
        #
        # Short version is, git verify-pack flushes buffers only on line endings, so
        # this works, if it didn't, then we could get partial lines and be sad.
        #
        # The command is passed as an array so that no shell is involved: pack
        # paths containing spaces or shell metacharacters reach git intact.
        IO.popen(['git', 'verify-pack', '-v', '--', *pack_files]) do |pipe|
          pipe.each_line do |line|
            # The output of git verify-pack -v is:
            # SHA1 type size size-in-packfile offset-in-packfile depth base-SHA1
            sha1, type, size = line.chomp.split(' ')
            # type is nil for blank or one-token lines; include? copes with that.
            next unless object_types.include?(type)
            log.info "INPUT_THREAD: Processing object #{sha1} type #{type} size #{size}"
            object = {
              :sha1 => sha1,
              :type => type,
              :size => size.to_i,
            }
            total_count += object[:size]
            counted_objects += 1
            large_objects.push(object) if object[:size] > CUTOFF_SIZE
          end
        end

        # In block form IO.popen sets $? once the pipe has been closed.
        log.warn "git verify-pack did not exit cleanly: #{$?}" unless $?.success?
      end

      log.info "Input complete"

      log.info "Counted #{counted_objects} totalling #{total_count} bytes."

      log.info "Sorting"

      # Largest objects first.
      large_objects.sort! { |a,b| b[:size] <=> a[:size] }

      log.info "Sorting complete"

      large_objects.each do |obj|
        log.info "#{obj[:sha1]} #{obj[:type]} #{obj[:size]}"
      end

      exit 0
    end
    

    Next, edit the file to remove any blobs you don't want, as well as the INPUT_THREAD lines at the top. Once you have only lines for the sha1s you want to find, run the following script like this:

    cat edited-large-files.log | cut -d' ' -f4 | xargs git-find-blob | tee large-file-paths.log
    

    Where the git-find-blob script is below.

    #!/usr/bin/perl
    
    # taken from: http://stackoverflow.com/questions/223678/which-commit-has-this-blob
    # and modified by Carl Myers  to scan multiple blobs at once
    # Also, modified to keep the discovered filenames
    # vi: ft=perl
    
    use 5.008;
    use strict;
    use Memoize;
    use Data::Dumper;
    
    
    my $BLOBS = {};
    
    # Entry point.
    #
    # Usage: git-find-blob <blob1> <blob2> ... -- [<git-log arguments ...>]
    #
    # Every argument before "--" is resolved with git-rev-parse and stored in
    # $BLOBS (object id => argument as given). Everything after "--" is handed
    # to git-log unchanged. For each commit whose tree contains at least one of
    # the blobs, prints "<abbreviated commit> <subject>" followed by one
    # tab-indented line per matching blob listing the paths it was found at.
    #
    # Dies when no blob is given, when a blob cannot be resolved, or when
    # git-log cannot be started or exits with an error.
    MAIN: {

        # check_tree is called once per commit; memoizing it means a tree shared
        # by several commits is only listed once.
        memoize 'check_tree';

        my $usage = "usage: git-find-blob <blob1> <blob2> ... -- [<git-log arguments ...>]\n";

        die $usage
            if not @ARGV;

        while ( @ARGV && $ARGV[0] ne '--' ) {
            my $arg = shift @ARGV;
            open my $rev_parse, '-|', git => 'rev-parse' => '--verify', $arg
                or die "Couldn't open pipe to git-rev-parse: $!\n";
            my $obj_name = <$rev_parse>;
            close $rev_parse or die "Couldn't expand passed blob.\n";
            defined $obj_name or die "Couldn't expand passed blob.\n";
            chomp $obj_name;
            print "($arg expands to $obj_name)\n";
            $BLOBS->{$obj_name} = $arg;
        }
        shift @ARGV; # drop the -- if present

        # Only "--" (plus git-log arguments) was given: nothing to search for.
        die $usage
            if not %{$BLOBS};

        # A single pass over the history is enough: check_tree looks for every
        # blob in $BLOBS at once, so looping over the blobs here would only
        # repeat the same output once per blob.
        open my $log, '-|', git => log => @ARGV, '--pretty=format:%T %h %s'
            or die "Couldn't open pipe to git-log: $!\n";

        while ( my $line = <$log> ) {
            chomp $line;
            my ( $tree, $commit, $subject ) = split " ", $line, 3;

            # Skip lines that do not carry at least a tree and a commit id.
            next if not defined $commit;
            $subject = '' if not defined $subject;

            my $results = check_tree( $tree );
            next if not %{$results};

            print "$commit $subject\n";
            foreach my $blob ( keys %{$results} ) {
                print "\t" . (join ", ", @{$results->{$blob}}) . "\n";
            }
        }

        close $log or die "git-log exited with an error.\n";

    }
    
    
    # check_tree( $tree )
    #
    # Lists the tree object $tree with git-ls-tree, descends into every subtree,
    # and collects the paths of all entries whose object id is a key of the
    # file-level $BLOBS hash.
    #
    # Returns a hash ref { OBJECT_ID => [ PATH, ... ] }, with paths relative to
    # $tree; the hash is empty when nothing matched. Prints a "Found ..." line
    # for every direct hit.
    #
    # Dies if git-ls-tree cannot be started, exits with an error, or prints a
    # line that is not in the expected format.
    sub check_tree {
        my ( $tree ) = @_;

        # [ OBJECT_ID, DIRNAME ] for every subtree entry of $tree
        my @subtree;

        # results = { BLOB => [ FILENAME1 ] }
        my $results = {};

        open my $ls_tree, '-|', git => 'ls-tree' => $tree
            or die "Couldn't open pipe to git-ls-tree: $!\n";

        # example git ls-tree output (mode SP type SP object TAB name):
        # 100644 blob 15d408e386400ee58e8695417fbe0f858f3ed424	filename.txt
        while ( my $entry = <$ls_tree> ) {
            chomp $entry;

            # The name is separated by exactly one TAB; matching \t rather than
            # \s+ keeps leading whitespace that is part of the name itself.
            my ( $type, $object, $name )
                = $entry =~ /\A[0-7]{6} (\S+) (\S+)\t(.*)\z/s
                or die "unexpected git-ls-tree output";

            # Direct hash lookup instead of comparing against every wanted blob.
            if ( exists $BLOBS->{$object} ) {
                print "Found $object in $tree:$name\n";
                push @{ $results->{$object} }, $name;
            }

            push @subtree, [ $object, $name ] if $type eq 'tree';
        }

        # Closed before recursing, so only one pipe is open at any depth.
        close $ls_tree or die "git-ls-tree failed for tree $tree\n";

        foreach my $st ( @subtree ) {
            my ( $st_tree, $dirname ) = @{$st};
            my $st_result = check_tree( $st_tree );

            # Copy the hits into this tree's result with the directory name
            # prepended; $st_result itself is left unmodified.
            foreach my $blob ( keys %{$st_result} ) {
                foreach my $filename ( @{ $st_result->{$blob} } ) {
                    push @{ $results->{$blob} }, $dirname . '/' . $filename;
                }
            }
        }

        return $results;
    }
    

    The output will look like this:

        <commit hash> <commit subject>
            path/to/file.txt
            path/to/file2.txt
            ...
        <commit hash 2> <commit subject 2>
    

    And so on. Every commit which contains a large file in its tree will be listed. If you grep for the lines that start with a tab and uniq the result, you will have a list of all paths you can filter-branch to remove, or you can do something more complicated.

    Let me reiterate: this process ran successfully on a 10GB repo with 108,000 commits. It took much longer than I predicted when running on a large number of blobs, though: over 10 hours. I will have to see if the memoize bit is working...

提交回复
热议问题