[英]Bulk insert scrapy pipeline using sqlalchemy
我正在從網站上抓取大量數據,問題是通過將一個一個一個插入數據庫花費太多時間我正在尋找一種智能方法來批量插入或批量插入數據庫,所以它贏了'不要永遠將它推送到數據庫。 我正在使用sqlalchemy1.4
orm 和 scrapy 框架。
楷模:
from sqlalchemy import Column, Date, String, Integer, create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from . import settings
engine = create_engine(settings.DATABSE_URL)
Session = sessionmaker(bind=engine)
session = Session()
DeclarativeBase = declarative_base()
class Olx_Eg(DeclarativeBase):
"""
Defines the property listing model
"""
__tablename__ = "olx_egypt"
_id = Column(Integer, primary_key=True)
URL = Column("URL", String)
Breadcrumb = Column("Breadcrumb", String)
Price = Column("Price", String)
Title = Column("Title", String)
Type = Column("Type", String)
Bedrooms = Column("Bedrooms", String)
Bathrooms = Column("Bathrooms", String)
Area = Column("Area", String)
Location = Column("Location", String)
Compound = Column("Compound", String)
seller = Column("seller", String)
Seller_member_since = Column("Seller_member_since", String)
Seller_phone_number = Column("Seller_phone_number", String)
Description = Column("Description", String)
Amenities = Column("Amenities", String)
Reference = Column("Reference", String)
Listed_date = Column("Listed_date", String)
Level = Column("Level", String)
Payment_option = Column("Payment_option", String)
Delivery_term = Column("Delivery_term", String)
Furnished = Column("Furnished", String)
Delivery_date = Column("Delivery_date", String)
Down_payment = Column("Down_payment", String)
Image_url = Column("Image_url", String)
這是我現在的scrapy管道:
from olx_egypt.models import Olx_Eg, session
class OlxEgPipeline:
def __init__(self):
"""
Initializes database connection and sessionmaker.
Creates items table.
"""
def process_item(self, item, spider):
"""
Process the item and store to database.
"""
# session = self.Session()
instance = session.query(Olx_Eg).filter_by(Reference=item["Reference"]).first()
if instance:
return instance
else:
olx_item = Olx_Eg(**item)
session.add(olx_item)
try:
session.commit()
except:
session.rollback()
raise
finally:
session.close()
return item
我嘗試創建一個列表並將項目附加到它,然后在關閉蜘蛛時將其推送到數據庫:
from olx_egypt.models import Olx_Eg, session
class ExampleScrapyPipeline:
def __init__(self):
self.items = []
def process_item(self, item, spider):
self.items.append(item)
return item
def close_spider(self, spider):
try:
session.bulk_insert_mappings(Olx_Eg, self.items)
session.commit()
except Exception as error:
session.rollback()
raise
finally:
session.close()
但它在session.bulk_insert_mappings(Olx_Eg, self.items)
這一行失敗了。 誰能告訴我如何制作scrapy管道批量或批量插入?
我實際上正在做一些非常相似的事情,並且已經建立了一個管道來使用pandas.to_sql
注入數據,需要的代碼行更少,而且它非常快,因為我已經激活了method='multi'
,如果你要上傳到mssql
然后您可以利用fast_executemany=True
,如這篇文章中所提供的: 使用 pyODBC 的 fast_executemany 加速 pandas.DataFrame.to_sql 。
我試圖使其盡可能通用以訪問不同的驅動程序名稱。
這是一個例子:
刮板.py
import scrapy
from scrapy_exercises.items import ScrapyExercisesItem
from scrapy.crawler import CrawlerProcess
class SQLTest(scrapy.Spider):
name = 'SQL'
start_urls = [f'https://quotes.toscrape.com/page/{i}/' for i in range(1, 11)]
custom_settings = {
"FEED": {"test" : {"format": "csv"}}
}
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url=url,
callback = self.parse
)
def parse(self, response):
content = response.xpath("//div[@class='col-md-8']//div")
for items in content:
table = ScrapyExercisesItem()
#table._name= items.xpath(".//span//@href").get()
#table._keyword= items.xpath(".//div[@class = 'tags']//a[1]//text()").get()
#yield table.returnTable()
table['name'] = items.xpath(".//span//@href").get()
table['keyword'] = items.xpath(".//div[@class = 'tags']//a[1]//text()").get()
return table
項目.py
import scrapy
class ScrapyExercisesItem(scrapy.Item):
name = scrapy.Field()
keyword = scrapy.Field()
管道.py
from sqlalchemy import create_engine, String
import pandas as pd
import pyodbc
import logging
from itemadapter import is_item
from itemadapter import ItemAdapter
logger = logging.getLogger(__name__)
class DataframeSQLPipelineInject:
def __init__(self, user, passw, host, port, database, table, if_exists, drivername):
self._user = user
self._passw = passw
self._host = host
self._port = port
self._database = database
self.table = table
self.if_exists = if_exists
self.drivername = drivername
@classmethod
def from_crawler(cls, crawler):
return cls(
user = crawler.settings.get('DATABASE')['user'],
passw = crawler.settings.get('DATABASE')['passw'],
host = crawler.settings.get('DATABASE')['host'],
port = crawler.settings.get('DATABASE')['port'],
database = crawler.settings.get('DATABASE')['database'],
table = crawler.settings.get('DATABASE')['table'],
if_exists = crawler.settings.get('DATABASE')['if_exists'],
drivername = crawler.settings.get('DATABASE')['drivername']
)
def open_spider(self, spider):
self.engine = create_engine(
f'{self.drivername}://' + #change this to your required server
self._user + ':' +
self._passw + '@' +
self._host + ':' +
str(self._port) + '/' +
self._database ,#+f'?driver=ODBC+Driver+18+for+SQL+Server' , #change this to your required driver
echo=False,
#connect_args={"timeout":30},
pool_pre_ping=True
#fast_executemany=True
#--- Add if using drivername mssql+pyodbc,
#then remove if_exists = self.if_exists from table_df
)
self.conn = self.engine.connect()
def close_spider(self, spider):
self.conn.close()
def process_item(self,item, spider):
if is_item(item):
table_df = pd.DataFrame([ItemAdapter(item).asdict()])
print(table_df.dtypes)
table_df.to_sql(self.table, con=self.engine,method='multi',dtype={'name':String(), 'keyword':String()}, chunksize=2000, index=False, if_exists = self.if_exists)
else:
logger.error(f'You need a dict for item, you have type: {type(item)}')
設置.py:
DATABASE = {
"user": "usr",
"passw": "",
"host": "localhost",
"port": '5432',
"database": "scraper",
'table':'some_table',
'if_exists':'append',
'drivername':'postgresql'
}
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
'scrapy_exercises.pipelines.sql_import.DataframeSQLPipelineInject':50
}
即使要創建表,也需要使用if_exists
並添加append
。 因為scrapy是單線程的,它會在每個反應器循環之后創建然后附加值。
我希望這有助於解決您的速度問題,因為我沒有使用大量數據進行測試。
它適用於我,檢查圖像:
用這個更新你的 items.py:
class ScrapyExercisesItem(scrapy.Item):
URL = scrapy.Field()
Breadcrumb = scrapy.Field()
Price = scrapy.Field()
Title = scrapy.Field()
Type = scrapy.Field()
Bedrooms = scrapy.Field()
Bathrooms = scrapy.Field()
Area = scrapy.Field()
Location = scrapy.Field()
keyword = scrapy.Field()
Compound = scrapy.Field()
seller = scrapy.Field()
Seller_member_since = scrapy.Field()
Seller_phone_number = scrapy.Field()
Description = scrapy.Field()
Amenities = scrapy.Field()
Reference = scrapy.Field()
Listed_date = scrapy.Field()
Level = scrapy.Field()
Payment_option = scrapy.Field()
Delivery_term = scrapy.Field()
Furnished = scrapy.Field()
Delivery_date = scrapy.Field()
Down_payment = scrapy.Field()
Image_url = scrapy.Field()
並在您的刮板中刪除以下內容:
item = {}
將其替換為:
from your_path.items import ScrapyExercisesItem
item = ScrapyExercisesItem()
然后不要yield
,而是return
。 它對我有用,所以它應該對你有用。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.