Bulk Insert A Pandas DataFrame Using SQLAlchemy

Asked 2020-11-28 22:07

I have some rather large pandas DataFrames and I'd like to use the new bulk SQL mappings to upload them to a Microsoft SQL Server via SQLAlchemy. The pandas.to_sql method, …

10 Answers
  • 2020-11-28 22:19

    This may already have been answered, but I found my solution by collating different answers on this site and aligning them with SQLAlchemy's documentation.

    1. The table needs to already exist in db1, with an auto-increment index set up.
    2. The class Current needs to align with the DataFrame imported from the CSV and with that table in db1.

    Hope this helps whoever comes here and wants to mix pandas and SQLAlchemy in a quick way.

    from urllib.parse import quote_plus as urlquote  # urllib.parse for Python 3
    import sqlalchemy
    from sqlalchemy import create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy import Column, Integer, String, Numeric
    from sqlalchemy.orm import sessionmaker
    import pandas as pd
    
    
    # Set up of the engine to connect to the database
    # urlquote escapes the password, which might contain special characters such as "/"
    engine = create_engine('mysql://root:%s@localhost/db1' % urlquote('weirdPassword*withsp€cialcharacters'), echo=False)
    conn = engine.connect()
    Base = declarative_base()
    
    #Declaration of the class in order to write into the database. This structure is standard and should align with SQLAlchemy's doc.
    class Current(Base):
        __tablename__ = 'tableName'
    
        id = Column(Integer, primary_key=True)
        Date = Column(String(500))
        Type = Column(String(500))
        Value = Column(Numeric())
    
        def __repr__(self):
            return "(id='%s', Date='%s', Type='%s', Value='%s')" % (self.id, self.Date, self.Type, self.Value)
    
    # Set up of the table in db and the file to import
    fileToRead = 'file.csv'
    tableToWriteTo = 'tableName'
    
    # Panda to create a lovely dataframe
    df_to_be_written = pd.read_csv(fileToRead)
    # orient='records' is the key here: it produces the list-of-dicts format that the docs describe for bulk inserts.
    listToWrite = df_to_be_written.to_dict(orient='records')
    
    metadata = sqlalchemy.schema.MetaData(bind=engine,reflect=True)
    table = sqlalchemy.Table(tableToWriteTo, metadata, autoload=True)
    
    # Open the session
    Session = sessionmaker(bind=engine)
    session = Session()
    
    # Insert the dataframe into the database in one bulk operation
    conn.execute(table.insert(), listToWrite)
    
    # Commit the changes
    session.commit()
    
    # Close the session
    session.close()
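
    Since the question mentions SQLAlchemy's bulk mappings: the same list of dicts can also be handed to the ORM bulk API instead of the Core insert above. A minimal sketch, reusing the Current class, listToWrite, and the Session factory defined above:

    # Alternative: ORM bulk insert instead of conn.execute(table.insert(), ...)
    session = Session()
    session.bulk_insert_mappings(Current, listToWrite)
    session.commit()
    session.close()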
    
  • 2020-11-28 22:19

    My Postgres-specific solution below auto-creates the database table from your pandas DataFrame and performs a fast bulk insert using Postgres' COPY my_table FROM STDIN.

    import io
    
    import pandas as pd
    from sqlalchemy import create_engine
    
    def write_to_table(df, db_engine, schema, table_name, if_exists='fail'):
        string_data_io = io.StringIO()
        df.to_csv(string_data_io, sep='|', index=False)
        pd_sql_engine = pd.io.sql.pandasSQL_builder(db_engine, schema=schema)
        table = pd.io.sql.SQLTable(table_name, pd_sql_engine, frame=df,
                                   index=False, if_exists=if_exists, schema=schema)
        table.create()
        string_data_io.seek(0)
        # the header row is skipped by the HEADER option in the COPY command below
        with db_engine.connect() as connection:
            with connection.connection.cursor() as cursor:
                copy_cmd = "COPY %s.%s FROM STDIN HEADER DELIMITER '|' CSV" % (schema, table_name)
                cursor.copy_expert(copy_cmd, string_data_io)
            connection.connection.commit()
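
    A hypothetical call to the helper above (the engine URL, schema, table name, and columns here are placeholders, not from the original answer):

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('postgresql+psycopg2://user:password@localhost:5432/mydb')
    df = pd.DataFrame({'name': ['a', 'b'], 'value': [1, 2]})
    write_to_table(df, engine, schema='public', table_name='my_table', if_exists='replace')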
    
  • 2020-11-28 22:19

    For people like me who are trying to implement the aforementioned solutions:

    pandas 0.24.0 now supports to_sql with a chunksize parameter and a method='multi' option, which inserts rows in bulk...
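
    A minimal sketch of that approach (the connection string and table name are placeholders):

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('mssql+pyodbc://user:password@my_dsn')  # placeholder connection string

    df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

    # method='multi' packs many rows into each INSERT statement;
    # chunksize limits rows per chunk (keep rows x columns under the driver's parameter limit)
    df.to_sql('my_table', engine, if_exists='append', index=False,
              chunksize=1000, method='multi')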

  • 2020-11-28 22:20

    For anyone facing this problem with Redshift as the destination DB: note that Redshift does not implement the full set of Postgres commands, so the answers using Postgres' COPY FROM or copy_from() will not work, failing with psycopg2.ProgrammingError: syntax error at or near "stdin".

    A solution for speeding up inserts into Redshift is to use a file ingest (COPY from S3) or Odo.

    References:
    About Odo: http://odo.pydata.org/en/latest/perf.html
    Odo with Redshift: https://github.com/blaze/odo/blob/master/docs/source/aws.rst
    Redshift COPY (from an S3 file): https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html
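
    A minimal sketch of the file-ingest route, assuming the DataFrame has already been written to a CSV in S3 and the cluster has an IAM role that can read the bucket (the cluster endpoint, bucket, table, and role ARN are placeholders):

    from sqlalchemy import create_engine, text

    engine = create_engine(
        'postgresql+psycopg2://user:password@my-cluster.example.redshift.amazonaws.com:5439/mydb')

    # Redshift's COPY ingests the file from S3 server-side, far faster than row-by-row INSERTs
    copy_sql = text("""
        COPY my_table
        FROM 's3://my-bucket/my_data.csv'
        IAM_ROLE 'arn:aws:iam::123456789012:role/MyRedshiftRole'
        CSV IGNOREHEADER 1;
    """)

    with engine.begin() as conn:
        conn.execute(copy_sql)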

  • 2020-11-28 22:24

    I ran into a similar issue, with pd.to_sql taking hours to upload data. The code below bulk inserted the same data in a few seconds.

    from sqlalchemy import create_engine
    import psycopg2 as pg
    # io.StringIO is used to stream the DataFrame as CSV into copy_from
    import io
    
    address = 'postgresql://<username>:<pswd>@<host>:<port>/<database>'
    engine = create_engine(address)
    connection = engine.raw_connection()
    cursor = connection.cursor()
    
    #df is the dataframe containing an index and the columns "Event" and "Day"
    #create Index column to use as primary key
    df.reset_index(inplace=True)
    df.rename(columns={'index':'Index'}, inplace =True)
    
    #create the table but first drop if it already exists
    command = '''DROP TABLE IF EXISTS localytics_app2;
    CREATE TABLE localytics_app2
    (
    "Index" serial primary key,
    "Event" text,
    "Day" timestamp without time zone,
    );'''
    cursor.execute(command)
    connection.commit()
    
    #stream the data using 'to_csv' and StringIO(); then use sql's 'copy_from' function
    output = io.StringIO()
    #ignore the index
    df.to_csv(output, sep='\t', header=False, index=False)
    #jump to start of stream
    output.seek(0)
    cur = connection.cursor()
    #null values become ''
    cur.copy_from(output, 'localytics_app2', null="")    
    connection.commit()
    cur.close()
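
    Note that copy_from loads the CSV columns into the table's columns in order; if the DataFrame's column order might differ from the table's, the columns can be named explicitly (a hedged variant of the call above):

    cur.copy_from(output, 'localytics_app2', null="", columns=('Index', 'Event', 'Day'))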
    
  • 2020-11-28 22:26

    This worked for me to connect to an Oracle database using cx_Oracle and SQLAlchemy:

    import sqlalchemy
    import cx_Oracle
    from sqlalchemy import create_engine
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy import Column, String
    from sqlalchemy.orm import sessionmaker
    import pandas as pd
    import time
    
    # credentials
    username = "username"
    password = "password"
    connectStr = "connection:/string"
    tableName = "tablename"
    
    t0 = time.time()
    
    # connection
    dsn = cx_Oracle.makedsn('host','port',service_name='servicename')
    
    Base = declarative_base()
    
    class LANDMANMINERAL(Base):
        __tablename__ = 'tablename'
    
        DOCUMENTNUM = Column(String(500), primary_key=True)
        DOCUMENTTYPE = Column(String(500))
        FILENUM = Column(String(500))
        LEASEPAYOR = Column(String(500))
        LEASESTATUS = Column(String(500))
        PROSPECT = Column(String(500))
        SPLIT = Column(String(500))
        SPLITSTATUS = Column(String(500))
    
    engine = create_engine('oracle+cx_oracle://%s:%s@%s' % (username, password, dsn))
    conn = engine.connect()  
    
    Base.metadata.bind = engine
    
    # Creating the session
    
    DBSession = sessionmaker(bind=engine)
    
    session = DBSession()
    
    # Bulk insertion
    data = pd.read_csv('data.csv')
    lists = data.to_dict(orient='records')
    
    
    table = sqlalchemy.Table('landmanmineral', Base.metadata, autoload=True, autoload_with=engine)
    conn.execute(table.insert(), lists)
    
    session.commit()
    
    session.close() 
    
    print("time taken %8.8f seconds" % (time.time() - t0) )
    