简体   繁体   中英

perform upsert operation on postgres like pandas to_sql function using python

Before asking this question, I have read many links about UPSERT operation on Postgres:

But the question is different from them, since the functionality is different. What I want is to implement something like pandas to_sql function which has the following features:

  • Automatically creates table
  • Keeps the data types of each column

The only drawback of to_sql is that it doesn't perform an UPSERT operation on Postgres. Is there any way to implement the expected functionality (automatically create a table based on the columns, perform an UPSERT operation, and keep the data types) by passing a dataframe to it?

Previously implemented code using Pandas to_sql function :

class PostgreSQL:
    """Minimal Postgres helper: connect via SQLAlchemy and append dataframes.

    Configuration (host, port, credentials) is read from the project-level
    ``config_dict`` / ``Consts`` objects, which are defined elsewhere.
    """

    def __init__(self):
        # Pull connection settings out of the shared project configuration.
        cfg = config_dict[Consts.POSTGRES.value]
        self.host = cfg[Consts.HOST.value]
        self.port = cfg[Consts.PORT.value]
        self.db_name = cfg[Consts.DB_NAME.value]
        self.username = cfg[Consts.USERNAME.value]
        self.password = cfg[Consts.PASSWORD.value]

    def get_connection(self) -> object:
        """Return a SQLAlchemy engine built from the configured URL template."""
        url = Consts.POSTGRES_URL_SCHEMA.value.format(
            self.username, self.password, self.host, self.port, self.db_name
        )
        try:
            return create_engine(url)
        except Exception:
            logger.error('Make sure you have provided correct credentials for the DB connection.')
            raise

    def save_df_to_db(self, df: object, table_name: str) -> None:
        """Append *df* to *table_name*, creating the table if it is missing."""
        engine = self.get_connection()
        df.to_sql(table_name, con=engine, if_exists='append')

I have written very generic code that performs an UPSERT from a Pandas dataframe in an efficient way — something that pandas' to_sql did not support natively (as of December 2021).

By using the following code, rows whose primary key already exists will be updated; otherwise it will create a new table (in case the table doesn't exist yet) and insert the new records into it.

Code :

import os

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, Table
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.ext.automap import automap_base


class PostgreSQL:
    """Postgres helper that saves a DataFrame with UPSERT semantics.

    If the target table does not exist it is created from the dataframe
    (via ``to_sql``) and given a primary key on ``id``; if it does exist,
    rows are inserted with ``ON CONFLICT ... DO UPDATE`` against the
    table's primary-key constraint.

    Relies on project-level objects defined elsewhere: ``config_dict``,
    ``Consts``, ``Directories``, ``get_query`` and ``logger``.
    """

    def __init__(self):
        # NOTE(review): config_dict / Consts come from elsewhere in the project.
        postgres_config = config_dict[Consts.POSTGRES.value]
        self.host = postgres_config[Consts.HOST.value]
        self.port = postgres_config[Consts.PORT.value]
        self.db_name = postgres_config[Consts.DB_NAME.value]
        self.username = postgres_config[Consts.USERNAME.value]
        self.password = postgres_config[Consts.PASSWORD.value]

    def get_connection(self) -> object:
        """Build and return a SQLAlchemy engine for the configured database."""
        url_schema = 'postgresql://{}:{}@{}:{}/{}'.format(
            self.username, self.password, self.host, self.port, self.db_name
        )
        try:
            return create_engine(url_schema)
        except Exception as e:
            logger.error('Make sure you have provided correct credentials for the DB connection.')
            raise e

    def run_query(self, query: str) -> list:
        """Execute *query* and return all result rows."""
        engine = self.get_connection()
        return engine.execute(query).fetchall()

    def save_df_to_db(self, df: object, table_name: str, chunk_size: int = 1000) -> None:
        """Create *table_name* from *df* if absent, otherwise UPSERT its rows.

        :param df: pandas DataFrame to persist.
        :param table_name: target table in the ``public`` schema.
        :param chunk_size: number of rows per INSERT statement (default 1000,
            matching the previous hard-coded value).
        :raises AttributeError: if *df* is not a dataframe (e.g. ``None``).
        """
        root_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
        engine = self.get_connection()
        add_primary_key_query = get_query(root_dir, Directories.COMMON.value, 'add_primary_key.sql', table_name)
        table_existence_query = get_query(root_dir, Directories.COMMON.value, 'table_existence.sql', table_name)

        if not engine.execute(table_existence_query).first()[0]:  # table does not exist yet
            logger.info('Create table automatically and from scratch!')
            # Reuse the engine created above instead of opening a second one.
            df.to_sql(table_name, con=engine, if_exists='append')
            engine.execute(add_primary_key_query)
            return

        try:
            # Normalize pandas' NaT (and its string form) to SQL NULL in a
            # single pass; the previous three chained replace() calls were
            # redundant ways of doing the same thing.
            df = df.replace({'NaT': None, pd.NaT: None})
            records = df.to_dict('records')
        except AttributeError as e:
            logger.error('Empty Dataframe!')
            raise e

        if not records:
            # Nothing to insert -- skip reflection and the upsert loop entirely.
            logger.info('Saving data is successfully done.')
            return

        with engine.connect() as connection:
            logger.info('Table already exists!')
            # Reflect the existing table so the INSERT statement knows the
            # real columns and primary-key constraint.
            base = automap_base()
            base.prepare(engine, reflect=True)
            target_table = Table(table_name, base.metadata,
                                 autoload=True, autoload_with=engine)

            # Batch the rows so each statement stays a manageable size.
            for start in range(0, len(records), chunk_size):
                chunk = records[start:start + chunk_size]
                stmt = insert(target_table).values(chunk)
                # Update every non-PK column from the proposed (EXCLUDED) row.
                update_cols = {c.name: c for c in stmt.excluded if not c.primary_key}
                connection.execute(stmt.on_conflict_do_update(
                    constraint=f'{table_name}_pkey',
                    set_=update_cols)
                )

            logger.info('Saving data is successfully done.')

Table existence query :

SELECT EXISTS (
    SELECT FROM information_schema.tables 
    WHERE  table_schema = 'public'
    AND    table_name   = '{}'
);

Add primary key query :

ALTER TABLE {} add primary key (id);

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM