Rails: Faster way to perform updates on many records

后端 未结 3 2059
执笔经年
执笔经年 2021-02-09 15:54

In our Rails 3.2.13 app (Ruby 2.0.0 + Postgres on Heroku), we are often retrieving a large amount of Order data from an API, and then we need to update or create each order in o

相关标签:
3条回答
  • 2021-02-09 16:36

    You can monkey-patch ActiveRecord like this:

    class ActiveRecord::Base

      # Adapted from:
      #http://stackoverflow.com/questions/15317837/bulk-insert-records-into-active-record-table?lq=1
      #https://gist.github.com/jackrg/76ade1724bd816292e4e
      #
      # Updates many rows with one raw SQL statement per batch. The statement is
      # sent through connection.execute, so no model callbacks or validations run.
      #
      # Statement shape:
      #  "UPDATE THIS SET <list_of_column_assignments>  FROM <table_name> THIS  JOIN (VALUES (<csv1>, <csv2>,...) VALS ( <column_names> ) ON <list_of_primary_keys_comparison>"
      #
      # NOTE(review): the "UPDATE <alias> ... FROM <table> <alias> JOIN (VALUES ...)"
      # form looks like SQL Server syntax; confirm it is accepted by the target
      # database before relying on it.
      #
      # Every record in a batch is written with the union of that batch's keys;
      # a record lacking one of those keys contributes quote(nil) for it.
      #
      # @param record_list [Array<Hash>] attribute hashes (String or Symbol keys);
      #   each one must include a non-blank value for every primary key column
      # @param batch_size [Integer] maximum number of records per statement
      # @return [Object, nil] whatever connection.execute returned for the last
      #   statement issued, or nil when record_list is empty / nothing was sent
      # @raise [RuntimeError] if the model has no primary key, record_list is not
      #   an Array of Hashes, or a record is missing a primary key value
      def self.bulk_update(record_list, batch_size = 1000)
        pk = primary_key
        raise "primary_key not found" unless pk.present?

        raise "record_list not an Array of Hashes" unless record_list.is_a?(Array) && record_list.all? { |rec| rec.is_a?(Hash) }
        return nil if record_list.empty?

        # Every hash needs its primary key(s), otherwise the JOIN cannot match a row.
        record_list.each do |rec|
          raise "Primary Keys '#{pk}' not found on record: #{rec}" unless hasAllPKs?(rec)
        end

        pk_columns = pk.is_a?(Array) ? pk : [pk]
        pk_comparison = pk_columns.map { |key| "THIS.#{key} = VALS.#{key}" }.join(' AND ')

        result = nil

        # each_slice yields non-overlapping batches; the previous index arithmetic
        # produced 1001-element slices that repeated the boundary record.
        record_list.each_slice(batch_size) do |batch|
          key_list, value_list = convert_record_list(batch)
          supplied_keys = batch.flat_map(&:keys).map(&:to_s)

          # Never assign primary keys, and do not overwrite created_at with the
          # timestamp that convert_record_list filled in on the caller's behalf.
          assignable = key_list.reject do |col|
            inPK?(col) || (col == 'created_at' && !supplied_keys.include?(col))
          end
          next if assignable.empty? # an empty SET list would be invalid SQL

          csv_vals = value_list.map { |row| "(#{row.join(', ')})" }.join(', ')
          column_names = key_list.join(', ')
          columns_assign = assignable.map { |col| "THIS.#{col} = VALS.#{col}" }.join(', ')

          sql = "UPDATE THIS SET #{columns_assign}  FROM #{table_name} THIS  JOIN ( VALUES #{csv_vals} ) VALS ( #{column_names} ) ON ( #{pk_comparison} )"
          result = connection.execute(sql)

          # Stop early on a negative numeric status. The type guard avoids a
          # NoMethodError when the adapter returns a result object instead.
          return result if result.is_a?(Numeric) && result < 0
        end

        result
      end

      # Tells whether the given column name is (part of) the primary key.
      #
      # @param str [#to_s] column name
      # @return [Boolean]
      def self.inPK?(str)
        pk = primary_key
        pk_columns = pk.is_a?(Array) ? pk : [pk]
        pk_columns.include?(str.to_s)
      end

      # Tests whether the hash has every primary key column as a key (String or
      # Symbol) with a non-blank value.
      #
      # @param hash [Hash] attribute hash
      # @return [Boolean]
      def self.hasAllPKs?(hash)
        attrs = hash.stringify_keys
        pk = primary_key
        pk_columns = pk.is_a?(Array) ? pk : [pk]
        pk_columns.all? { |key| attrs.key?(key) && attrs[key].present? }
      end

      # Turns attribute hashes into parallel column and value lists for a
      # VALUES clause.
      #
      # @param record_list [Array<Hash>] attribute hashes (String or Symbol keys)
      # @return [Array(Array<String>, Array<Array<String>>)] the sorted column
      #   names, and one row of connection-quoted values per record in the same
      #   column order
      def self.convert_record_list(record_list)
        # Union of all keys, as sorted strings, so every row has the same columns.
        key_list = record_list.flat_map(&:keys).map(&:to_s).uniq.sort
        conn = ActiveRecord::Base.connection

        value_list = record_list.map do |rec|
          key_list.map do |key|
            # key? instead of || so an explicit false is not replaced by nil.
            conn.quote(rec.key?(key) ? rec[key] : rec[key.to_sym])
          end
        end

        # If the table has standard timestamps and they're not in the record
        # list, add them with the current time.
        time = conn.quote(Time.now)
        %w(created_at updated_at).each do |field_name|
          next unless column_names.include?(field_name)
          next if key_list.include?(field_name)

          key_list << field_name
          value_list.each { |row| row << time }
        end

        [key_list, value_list]
      end
    end
    

    Then, you can generate an array of hashes containing your models' attributes (including their primary keys) and do something like:

    # Usage sketch: run the bulk update inside one transaction. `Model`, `attrN`
    # and `valN` are placeholders. Every hash must include the model's primary
    # key, because bulk_update raises for a record that lacks it.
    ActiveRecord::Base.transaction do
       Model.bulk_update [ {attr1: val1, attr2: val2,...},  {attr1: val1, attr2: val2,...},   ... ]
    end
    

    It will issue a single SQL command per batch of roughly 1,000 records, without running Rails callbacks or validations.

    0 讨论(0)
  • 2021-02-09 16:37

    For PostgreSQL, there are several issues that the above approach does not address:

    1. You must specify an actual table, not just an alias, in the update target table.
    2. You cannot repeat the target table in the FROM phrase. Since you are joining the target table to a VALUES table (hence there is only one table in the FROM phrase), you won't be able to use JOIN; you must instead put the join condition in a "WHERE" clause.
    3. You don't get the same "free" casts in a VALUES table that you do in a simple "UPDATE" command, so you must cast date/timestamp values as such (#val_cast does this).

      class ActiveRecord::Base

        # Bulk-updates rows with one "UPDATE ... FROM (VALUES ...)" statement per
        # batch, matching rows on the model's primary key. The SQL is sent
        # straight to the connection, so no model callbacks or validations run.
        #
        # The primary key column and created_at are never assigned. Every record
        # in a batch is written with the union of that batch's keys; a record
        # lacking one of those keys contributes quote(nil) for it.
        #
        # NOTE(review): this defines a class-level update! on ActiveRecord::Base
        # and calls connection.update_sql; confirm neither clashes with nor is
        # missing from the Rails version in use.
        #
        # @param record_list [Array<Hash>] attribute hashes (String or Symbol
        #   keys), each including the primary key
        # @param batch_size [Integer] maximum number of records per statement
        # @return [Array<Hash>] record_list, unchanged
        # @raise [ArgumentError] if record_list is not an Array of Hashes, or a
        #   key does not name a column of the table
        def self.update!(record_list, batch_size = 1000)
          unless record_list.is_a?(Array) && record_list.all? { |rec| rec.is_a?(Hash) }
            # The comma matters: without it Ruby parses a call to a method named
            # ArgumentError and raises NoMethodError instead.
            raise ArgumentError, "record_list not an Array of Hashes"
          end
          return record_list if record_list.empty?

          key_field = primary_key
          # convert_record_list returns double-quoted names, so compare quoted.
          excluded_fields = [%Q["#{key_field}"], %Q["created_at"]]

          # each_slice yields non-overlapping batches; the previous index
          # arithmetic produced 1001-element slices that repeated the boundary
          # record.
          record_list.each_slice(batch_size) do |batch|
            field_list, value_list = convert_record_list(batch)
            non_key_fields = field_list - excluded_fields
            next if non_key_fields.empty? # an empty SET list would be invalid SQL

            columns_assign = non_key_fields.map { |field| "#{field} = #{val_cast(field)}" }.join(",")
            value_table = value_list.map { |row| "(#{row.join(", ")})" }.join(", ")
            sql = "UPDATE #{table_name} AS this SET #{columns_assign} FROM (VALUES #{value_table}) vals (#{field_list.join(", ")}) WHERE this.#{key_field} = vals.#{key_field}"
            connection.update_sql(sql)
          end

          record_list
        end

        # Builds the right-hand side of a SET assignment for one column of the
        # VALUES table. Columns whose SQL type mentions "time" or "date" get an
        # explicit cast to that type.
        #
        # @param field [String] column name, optionally wrapped in double quotes
        # @return [String] SQL expression referencing vals.<field>
        # @raise [ArgumentError] if the table has no column with that name
        def self.val_cast(field)
          name = field.gsub('"', '')
          column = columns.find { |c| c.name == name }
          raise ArgumentError, "unknown column '#{name}' for table #{table_name}" if column.nil?

          if column.sql_type =~ /time|date/
            "cast (vals.#{name} as #{column.sql_type})"
          else
            "vals.#{name}"
          end
        end

        # Turns attribute hashes into parallel column and value lists for a
        # VALUES clause.
        #
        # @param record_list [Array<Hash>] attribute hashes (String or Symbol keys)
        # @return [Array(Array<String>, Array<Array<String>>)] the sorted,
        #   double-quoted column names, and one row of connection-quoted values
        #   per record in the same column order
        def self.convert_record_list(record_list)
          # Union of all keys, as sorted strings, so every row has the same columns.
          field_list = record_list.flat_map(&:keys).map(&:to_s).uniq.sort
          conn = ActiveRecord::Base.connection

          value_list = record_list.map do |rec|
            field_list.map do |field|
              # key? instead of || so an explicit false is not replaced by nil.
              conn.quote(rec.key?(field) ? rec[field] : rec[field.to_sym])
            end
          end

          # If the table has standard timestamps and they're not in the record
          # list, add them with the current time.
          time = conn.quote(Time.now)
          %w(created_at updated_at).each do |field_name|
            next unless column_names.include?(field_name)
            next if field_list.include?(field_name)

            field_list << field_name
            value_list.each { |row| row << time }
          end

          field_list.map! { |field| %Q["#{field}"] }

          [field_list, value_list]
        end
      end
      
    0 讨论(0)
  • 2021-02-09 16:40

    Try wrapping your entire code in a single database transaction. Since you're on Heroku, the back end will be Postgres. With that many update statements, you can probably benefit greatly from committing them all at once: your code executes quicker and basically just leaves a "queue" of 6500 statements for Postgres to work through as the server is able to dequeue them. Depending on the back end, you might have to commit in smaller chunks - but even batching 100 statements per transaction (then closing it and opening a new one) would greatly improve throughput into Pg.

    http://api.rubyonrails.org/classes/ActiveRecord/Transactions/ClassMethods.html http://www.postgresql.org/docs/9.2/static/sql-set-transaction.html

    So before line 2 you'd add something like:

    def add_details(shop, shopify_orders)
      Order.transaction do
        shopify_orders.each do |shopify_order|
    

    And then at the very end of your method add another end:

          if !payment_details.blank?
            PaymentDetail.add_details(order, payment_details)
          end
        end //shopify_orders.each..
      end //Order.transaction..
    end //method
    
    0 讨论(0)
提交回复
热议问题