I'm trying to use an SQLAlchemy expression with dask's read_sql_table in order to bring down a dataset that is created by joining and filtering a few different tables. The documentation indicates that this should be possible.
(The example below does not include any joins, as they are not needed to replicate the problem.)
I build my connection string, create an SQLAlchemy engine and table corresponding to a table in my database. (I'm using PostgreSQL.)
import dask.dataframe as dd
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import Column, MetaData, Table
from sqlalchemy.sql import select
# Placeholder connection settings -- substitute the real values for your database.
username = 'username'
password = 'password'
server = 'prod'
database = 'my_db'

# PostgreSQL URL (psycopg2 driver) assembled from the settings above.
connection_string = (
    f'postgresql+psycopg2://{username}:{password}@{server}/{database}'
)

# Minimal description of my_schema.my_table: only the ``id`` column is declared.
metadata = MetaData()
t = Table(
    'my_table',
    metadata,
    Column('id'),
    schema='my_schema',
)

# Engine bound to the connection string built above.
engine = create_engine(connection_string)
I'm able to build a select and use it with SQLAlchemy with no issue
>>> s = select([t]).limit(5)
>>> rp = engine.execute(s)
>>> rp.fetchall()
[(3140757,), (3118225,), (3156070,), (3193075,), (3114614,)]
I'm also able to feed the SQLAlchemy select to pandas' read_sql, which works fine:
>>> pd.read_sql(s, connection_string)
id
0 3140757
1 3118225
2 3156070
3 3193075
4 3114614
However, when I pass the same select to dask, I get a ProgrammingError. The traceback shows that dask turns around and calls pandas.read_sql, so you would think it should work, but something is obviously going wrong.
>>> dd.read_sql_table(s, connection_string, index_col='id')
---------------------------------------------------------------------------
ProgrammingError Traceback (most recent call last)
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1192 parameters,
-> 1193 context)
1194 except BaseException as e:
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\engine\default.py in do_execute(self, cursor, statement, parameters, context)
508 def do_execute(self, cursor, statement, parameters, context=None):
--> 509 cursor.execute(statement, parameters)
510
ProgrammingError: subquery in FROM must have an alias
LINE 2: FROM (SELECT my_schema.my_table.id AS id
^
HINT: For example, FROM (SELECT ...) [AS] foo.
The above exception was the direct cause of the following exception:
ProgrammingError Traceback (most recent call last)
<ipython-input-5-0db95e60f442> in <module>
----> 1 dd.read_sql_table(s, connection_string, index_col='id')
C:\miniconda3\envs\my_env\lib\site-packages\dask\dataframe\io\sql.py in read_sql_table(table, uri, index_col, divisions, npartitions, limits, columns, bytes_per_chunk, head_rows, schema, meta, engine_kwargs, **kwargs)
116 # derrive metadata from first few rows
117 q = sql.select(columns).limit(head_rows).select_from(table)
--> 118 head = pd.read_sql(q, engine, **kwargs)
119
120 if head.empty:
C:\miniconda3\envs\my_env\lib\site-packages\pandas\io\sql.py in read_sql(sql, con, index_col, coerce_float, params, parse_dates, columns, chunksize)
395 sql, index_col=index_col, params=params,
396 coerce_float=coerce_float, parse_dates=parse_dates,
--> 397 chunksize=chunksize)
398
399
C:\miniconda3\envs\my_env\lib\site-packages\pandas\io\sql.py in read_query(self, sql, index_col, coerce_float, parse_dates, params, chunksize)
1061 args = _convert_params(sql, params)
1062
-> 1063 result = self.execute(*args)
1064 columns = result.keys()
1065
C:\miniconda3\envs\my_env\lib\site-packages\pandas\io\sql.py in execute(self, *args, **kwargs)
952 def execute(self, *args, **kwargs):
953 """Simple passthrough to SQLAlchemy connectable"""
--> 954 return self.connectable.execute(*args, **kwargs)
955
956 def read_table(self, table_name, index_col=None, coerce_float=True,
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\engine\base.py in execute(self, statement, *multiparams, **params)
2073
2074 connection = self.contextual_connect(close_with_result=True)
-> 2075 return connection.execute(statement, *multiparams, **params)
2076
2077 def scalar(self, statement, *multiparams, **params):
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\engine\base.py in execute(self, object, *multiparams, **params)
946 raise exc.ObjectNotExecutableError(object)
947 else:
--> 948 return meth(self, multiparams, params)
949
950 def _execute_function(self, func, multiparams, params):
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\sql\elements.py in _execute_on_connection(self, connection, multiparams, params)
267 def _execute_on_connection(self, connection, multiparams, params):
268 if self.supports_execution:
--> 269 return connection._execute_clauseelement(self, multiparams, params)
270 else:
271 raise exc.ObjectNotExecutableError(self)
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\engine\base.py in _execute_clauseelement(self, elem, multiparams, params)
1058 compiled_sql,
1059 distilled_params,
-> 1060 compiled_sql, distilled_params
1061 )
1062 if self._has_events or self.engine._has_events:
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1198 parameters,
1199 cursor,
-> 1200 context)
1201
1202 if self._has_events or self.engine._has_events:
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\engine\base.py in _handle_dbapi_exception(self, e, statement, parameters, cursor, context)
1411 util.raise_from_cause(
1412 sqlalchemy_exception,
-> 1413 exc_info
1414 )
1415 else:
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\util\compat.py in raise_from_cause(exception, exc_info)
263 exc_type, exc_value, exc_tb = exc_info
264 cause = exc_value if exc_value is not exception else None
--> 265 reraise(type(exception), exception, tb=exc_tb, cause=cause)
266
267 if py3k:
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\util\compat.py in reraise(tp, value, tb, cause)
246 value.__cause__ = cause
247 if value.__traceback__ is not tb:
--> 248 raise value.with_traceback(tb)
249 raise value
250
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\engine\base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1191 statement,
1192 parameters,
-> 1193 context)
1194 except BaseException as e:
1195 self._handle_dbapi_exception(
C:\miniconda3\envs\my_env\lib\site-packages\sqlalchemy\engine\default.py in do_execute(self, cursor, statement, parameters, context)
507
508 def do_execute(self, cursor, statement, parameters, context=None):
--> 509 cursor.execute(statement, parameters)
510
511 def do_execute_no_params(self, cursor, statement, context=None):
ProgrammingError: (psycopg2.ProgrammingError) subquery in FROM must have an alias
LINE 2: FROM (SELECT my_schema.my_table.id AS id
^
HINT: For example, FROM (SELECT ...) [AS] foo.
[SQL: 'SELECT id \nFROM (SELECT my_schema.my_table.id AS id \nFROM my_schema.my_table \n LIMIT %(param_1)s) \n LIMIT %(param_2)s'] [parameters: {'param_1': 5, 'param_2': 5}] (Background on this error at: http://sqlalche.me/e/f405)
For anyone else who runs across this question: read_sql_table does not seem to support this use case (at this time). If you pass in an SQLAlchemy Select object, it ends up getting wrapped in another SQLAlchemy Select without an alias, which is bad SQL (at least for PostgreSQL).
Looking at read_sql_table from the dask source, table is the Select object that is passed to read_sql_table and as seen, it gets wrapped in another select.
q = sql.select(columns).where(sql.and_(index >= lower, cond)
).select_from(table)
The good news is that the read_sql_table function is relatively straightforward, and the magic is really only a couple of lines that create a dataframe from delayed objects. You just need to write your own logic to break the query into chunks:
# One delayed task per chunk query. Each task must receive its own
# ``query_chunk``; the original snippet passed the unrelated name ``q`` here,
# so the loop variable was never used.
parts = [
    delayed(_read_sql_chunk)(query_chunk, uri, meta, **kwargs)
    for query_chunk in queries
]
return from_delayed(parts, meta, divisions=divisions)
def _read_sql_chunk(q, uri, meta, **kwargs):
    """Run one chunk query and return its rows typed like ``meta``.

    Parameters
    ----------
    q
        Query handed to ``pd.read_sql`` as its first argument.
    uri
        Connection (or connection string) handed to ``pd.read_sql`` as its
        second argument.
    meta
        DataFrame whose ``dtypes`` describe the expected column types.
    **kwargs
        Forwarded unchanged to ``pd.read_sql``.

    Returns
    -------
    ``meta`` itself when the query produces an empty frame; otherwise the
    query result cast to the dtypes of ``meta``.
    """
    df = pd.read_sql(q, uri, **kwargs)
    if df.empty:
        # Nothing came back: hand back the template so the column layout and
        # dtypes still match the other chunks.
        return meta
    # copy=False avoids duplicating columns that already have the target dtype.
    return df.astype(meta.dtypes.to_dict(), copy=False)
As Chris said in a different answer, Dask wraps your query in something of the form SELECT columns FROM (yourquery), which is invalid syntax for PostgreSQL because it expects an alias for that parenthesised expression. Without reimplementing the whole read_sql_table method, the expression can be aliased simply by adding .alias('somename') to your select, i.e.
select([t]).limit(5).alias('foo')
That expression, when wrapped by Dask, generates correct syntax for Postgres
SELECT columns FROM (yourquery) AS foo
The query sent on that line is auto-generated by SQLAlchemy, so the syntax ought to be correct. However, I notice that your original query includes a .limit() modifier. The purpose of the line head = pd.read_sql(q, engine, **kwargs) is to get the first few rows, to infer types. If the original query already has a limit clause, I can see that the two might conflict. Please try using a query without .limit().
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.