问题
I am dealing with a huge table where I have to do query. I decided to do so by chunking my data based on user_id and every time read and write into the sql.
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://')
q1 = "SELECT max(id) FROM users"
max_users = pd.read_sql(q1, engine)
max_users = max_users.iloc[0][0]
# since user_ids start from 1 to ... I make the split based on that
data = range(max_users)
chunks = [list(data[x:x+1000]) for x in range(0, len(data), 1000)]
def make_q(userid):
q2 = "SELECT alotofusers from bigtable WHERE userid in (" + str(','.join(str(e) for e in userid)) + ")"
from multiprocessing import Pool, TimeoutError
import time
import os
table_name = "user_type_tmp6"
def f(q):
df = pd.read_sql(q, engine)
df.to_sql(con=engine, name=table_name, if_exists='append')
pool = Pool(processes=10) # start 4 worker processes
pool.map(f, [make_q(item) for item in chunks[0:3]])
In fact my table only get populated by the first chunck but I get the following error
Exception during reset or similar
Traceback (most recent call last):
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 680, in _finalize_fairy
fairy._reset(pool)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 867, in _reset
pool._dialect.do_rollback(self)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/dialects/mysql/base.py", line 2302, in do_rollback
dbapi_connection.rollback()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 430, in rollback
self._read_ok_packet()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 394, in _read_ok_packet
pkt = self._read_packet()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 671, in _read_packet
% (packet_number, self._next_seq_id))
pymysql.err.InternalError: Packet sequence number wrong - got 48 expected 1
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1246, in _execute_context
cursor, statement, parameters, context
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 581, in do_execute
cursor.execute(statement, parameters)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 170, in execute
result = self._query(query)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 328, in _query
conn.query(q)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 517, in query
self._affected_rows = self._read_query_result(unbuffered=unbuffered)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 732, in _read_query_result
result.read()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 1075, in read
first_packet = self.connection._read_packet()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 671, in _read_packet
% (packet_number, self._next_seq_id))
pymysql.err.InternalError: Packet sequence number wrong - got 114 expected 1
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 733, in _rollback_impl
self.engine.dialect.do_rollback(self.connection)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/dialects/mysql/base.py", line 2302, in do_rollback
dbapi_connection.rollback()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 429, in rollback
self._execute_command(COMMAND.COM_QUERY, "ROLLBACK")
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 750, in _execute_command
raise err.InterfaceError("(0, '')")
pymysql.err.InterfaceError: (0, '')
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
File "user_app_usage_type.py", line 90, in f
df = pd.read_sql(q, engine) # index_col = 'user_id'
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 436, in read_sql
chunksize=chunksize,
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 1218, in read_query
result = self.execute(*args)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 1087, in execute
return self.connectable.execute(*args, **kwargs)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2182, in execute
return connection.execute(statement, *multiparams, **params)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 976, in execute
return self._execute_text(object_, multiparams, params)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1149, in _execute_text
parameters,
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1250, in _execute_context
e, statement, parameters, cursor, context
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1471, in _handle_dbapi_exception
self._autorollback()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/langhelpers.py", line 79, in __exit__
compat.reraise(type_, value, traceback)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 153, in reraise
raise value
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1471, in _handle_dbapi_exception
self._autorollback()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 861, in _autorollback
self._root._rollback_impl()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 735, in _rollback_impl
self._handle_dbapi_exception(e, None, None, None, None)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1384, in _handle_dbapi_exception
exc_info,
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 398, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 152, in reraise
raise value.with_traceback(tb)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 733, in _rollback_impl
self.engine.dialect.do_rollback(self.connection)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/dialects/mysql/base.py", line 2302, in do_rollback
dbapi_connection.rollback()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 429, in rollback
self._execute_command(COMMAND.COM_QUERY, "ROLLBACK")
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 750, in _execute_command
raise err.InterfaceError("(0, '')")
sqlalchemy.exc.InterfaceError: (pymysql.err.InterfaceError) (0, '')
(Background on this error at: http://sqlalche.me/e/rvf5)
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "user_app_usage_type.py", line 109, in <module>
pool.map(f, [make_q(item) for item in chunks[0:3]])
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 268, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 657, in get
raise self._value
sqlalchemy.exc.InterfaceError: (pymysql.err.InterfaceError) (0, '')
(Background on this error at: http://sqlalche.me/e/rvf5)
I am guess I am doing the multiprocessing not correct ! Or perhaps the sqlalchemy is not aligned with the pooling.
Update
From my understand by reading this and this suggested by Ilja I updated my worked function as following
def f(q):
engine = create_engine('mysql+pymysql://')
df = pd.read_sql(q, engine, index_col = 'user_id')
df.fillna(0, inplace = True)
df.to_csv('tmp.csv')
df.to_sql(con=engine, name=table_name, if_exists='append' )
engine.dispose()
but now I get errors like
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1246, in _execute_context
cursor, statement, parameters, context
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 581, in do_execute
cursor.execute(statement, parameters)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 170, in execute
result = self._query(query)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 328, in _query
conn.query(q)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 517, in query
self._affected_rows = self._read_query_result(unbuffered=unbuffered)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 732, in _read_query_result
result.read()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 1075, in read
first_packet = self.connection._read_packet()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 684, in _read_packet
packet.check_error()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/protocol.py", line 220, in check_error
err.raise_mysql_exception(self._data)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/err.py", line 109, in raise_mysql_exception
raise errorclass(errno, errval)
pymysql.err.InternalError: (1050, "Table 'users_usage_frequency_oly_12' already exists")
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
return list(map(*args))
File "user_login_type.py", line 103, in f
df.to_sql(con=engine, name=table_name, schema = 'datateam', if_exists='append' ) # dtype={'user_type': Enum('Browser', 'Hoarder', 'Mementor', 'Explorer', 'Lister', 'Scanner') }
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/core/generic.py", line 2712, in to_sql
method=method,
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 518, in to_sql
method=method,
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 1319, in to_sql
table.create()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 656, in create
self._execute_create()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pandas/io/sql.py", line 638, in _execute_create
self.table.create()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/sql/schema.py", line 870, in create
bind._run_visitor(ddl.SchemaGenerator, self, checkfirst=checkfirst)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 2049, in _run_visitor
conn._run_visitor(visitorcallable, element, **kwargs)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1618, in _run_visitor
visitorcallable(self.dialect, self, **kwargs).traverse_single(element)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/sql/visitors.py", line 138, in traverse_single
return meth(obj, **kw)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/sql/ddl.py", line 826, in visit_table
include_foreign_key_constraints,
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 982, in execute
return meth(self, multiparams, params)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/sql/ddl.py", line 72, in _execute_on_connection
return connection._execute_ddl(self, multiparams, params)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1044, in _execute_ddl
compiled,
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1250, in _execute_context
e, statement, parameters, cursor, context
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1476, in _handle_dbapi_exception
util.raise_from_cause(sqlalchemy_exception, exc_info)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 398, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/util/compat.py", line 152, in reraise
raise value.with_traceback(tb)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/base.py", line 1246, in _execute_context
cursor, statement, parameters, context
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 581, in do_execute
cursor.execute(statement, parameters)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 170, in execute
result = self._query(query)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/cursors.py", line 328, in _query
conn.query(q)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 517, in query
self._affected_rows = self._read_query_result(unbuffered=unbuffered)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 732, in _read_query_result
result.read()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 1075, in read
first_packet = self.connection._read_packet()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/connections.py", line 684, in _read_packet
packet.check_error()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/protocol.py", line 220, in check_error
err.raise_mysql_exception(self._data)
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/site-packages/pymysql/err.py", line 109, in raise_mysql_exception
raise errorclass(errno, errval)
sqlalchemy.exc.InternalError: (pymysql.err.InternalError) (1050, "Table 'tmp_oly_12' already exists")
[SQL:
CREATE TABLE tmp_oly_12 (
user_id BIGINT,
total_logins BIGINT,
distinct_month BIGINT,
freq TEXT,
lastlogin DATETIME,
typelastlog TEXT
)
]
(Background on this error at: http://sqlalche.me/e/2j85)
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "user_login_type.py", line 125, in <module>
pool.map(f, [make_q(item) for item in chunks[0:3]])
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 268, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/Users/opt/anaconda3/envs/UserExperience/lib/python3.7/multiprocessing/pool.py", line 657, in get
raise self._value
sqlalchemy.exc.InternalError: (pymysql.err.InternalError) (1050, "Table 'users_usage_frequency_oly_12' already exists")
[SQL:
CREATE TABLE tmp_oly_12 (
user_id BIGINT,
total_logins BIGINT,
distinct_month BIGINT,
freq TEXT,
lastlogin DATETIME,
I can see the table tmp_oly_12
is populated, not fully - but still I get this error ...
来源:https://stackoverflow.com/questions/60092814/reading-and-writing-to-sql-using-pandas-through-multiprocessing