How to UPSERT (MERGE, INSERT … ON DUPLICATE UPDATE) in PostgreSQL?

灰色年华 2020-11-21 05:42

A very frequently asked question here is how to do an upsert, which is what MySQL calls INSERT ... ON DUPLICATE UPDATE and the standard supports as part of the MERGE operation.

6 Answers
  •  滥情空心
    2020-11-21 05:56

    Since this question was closed, I'm posting here how to do it using SQLAlchemy. Via recursion, it retries the bulk insert or update to combat race conditions and validation errors.

    First, the imports:

    import itertools as it
    
    from functools import partial
    from operator import itemgetter
    
    from sqlalchemy.exc import IntegrityError
    from app import session
    from models import Posts
    

    Now a couple of helper functions:

    def chunk(content, chunksize=None):
        """Groups data into chunks each with (at most) `chunksize` items.
        https://stackoverflow.com/a/22919323/408556
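
        Example:
            >>> list(chunk(range(5), 2))
            [[0, 1], [2, 3], [4]]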
        """
        if chunksize:
            i = iter(content)
            generator = (list(it.islice(i, chunksize)) for _ in it.count())
        else:
            generator = iter([content])
    
        return it.takewhile(bool, generator)
    
    
    def gen_resources(records):
        """Yields a dictionary if the record's id already exists, a row object 
        otherwise.
        """
        ids = {item[0] for item in session.query(Posts.id)}
    
        for record in records:
            is_row = hasattr(record, 'to_dict')
    
            if is_row and record.id in ids:
                # It's a row but the id already exists, so we need to convert it 
                # to a dict that updates the existing record. Since it is duplicate,
                # also yield True
                yield record.to_dict(), True
            elif is_row:
                # It's a row and the id doesn't exist, so no conversion needed. 
                # Since it's not a duplicate, also yield False
                yield record, False
            elif record['id'] in ids:
                # It's a dict and the id already exists, so no conversion needed. 
                # Since it is duplicate, also yield True
                yield record, True
            else:
                # It's a dict and the id doesn't exist, so we need to convert it. 
                # Since it's not a duplicate, also yield False
                yield Posts(**record), False
    

    And finally, the upsert function:

    def upsert(data, chunksize=None):
        for records in chunk(data, chunksize):
            resources = gen_resources(records)
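            # Sort on the is_duplicate flag so groupby splits updates from inserts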
            sorted_resources = sorted(resources, key=itemgetter(1))
    
            for dupe, group in it.groupby(sorted_resources, itemgetter(1)):
                items = [g[0] for g in group]
    
                if dupe:
                    _upsert = partial(session.bulk_update_mappings, Posts)
                else:
                    _upsert = session.add_all
    
                try:
                    _upsert(items)
                    session.commit()
                except IntegrityError:
                    # A record was added or deleted after we checked, so retry
                    # 
                    # modify accordingly by adding additional exceptions, e.g.,
                    # except (IntegrityError, ValidationError, ValueError)
                    session.rollback()
                    upsert(items)
                except Exception as e:
                    # Some other error occurred so reduce chunksize to isolate the 
                    # offending row(s)
                    session.rollback()
                    num_items = len(items)
    
                    if num_items > 1:
                        upsert(items, num_items // 2)
                    else:
                        print('Error adding record {}: {}'.format(items[0], e))
    

    Here's how you use it:

    >>> data = [
    ...     {'id': 1, 'text': 'updated post1'}, 
    ...     {'id': 5, 'text': 'updated post5'}, 
    ...     {'id': 1000, 'text': 'new post1000'}]
    ... 
    >>> upsert(data)
    

    The advantage this has over bulk_save_objects is that it can handle relationships, error checking, etc., on insert (unlike bulk operations).
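
    For what it's worth, on PostgreSQL 9.5+ you can skip the read-then-write round trip entirely: SQLAlchemy (1.1+) exposes the native INSERT ... ON CONFLICT clause through its postgresql dialect. Here's a minimal sketch reusing the session and Posts model from above; pg_upsert is a hypothetical helper name, and the conflict target (id) and updated column (text) are assumptions based on the example data:

    from sqlalchemy.dialects.postgresql import insert

    from app import session
    from models import Posts


    def pg_upsert(data):
        """Insert each dict in `data`; if a row with the same id already
        exists, update its `text` column instead (assumed schema).
        """
        stmt = insert(Posts.__table__).values(data)
        stmt = stmt.on_conflict_do_update(
            index_elements=['id'],  # conflict target: the primary key
            set_={'text': stmt.excluded['text']})  # EXCLUDED holds the proposed row
        session.execute(stmt)
        session.commit()


    Since the duplicate check happens inside the database in a single atomic statement, the race condition that the retry logic above guards against can't occur.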
