[英]How to call a Scrapy Spider in a different Python Script
我正在处理一个项目,我需要在不同的 python 文件中调用我的 Scrapy Spider,以便在处理之前更新以前抓取的数据。 我不确定我该怎么做。 我还在学习 scrapy 所以如果你能解释我应该做什么就太好了。
我的文件结构如下:
这里calculator.py 是我要调用我的蜘蛛的外部文件。
Calculator.py 的代码如下:
from typing import Union
import pandas as pd
import tkinter
from tkinter import *
from tkinter import messagebox
from datetime import *
from pandas import DataFrame, Series

# Load the scraped attendance records; column index 1 holds the date strings,
# so let pandas parse them into datetimes on read.
data = pd.read_csv(
    r"C:\Users\Administrator\PycharmProjects\Sarthak_project\attendance\attendance\final.csv",
    parse_dates=[1],
)

# Sort by subject first, then chronologically within each subject.
data.sort_values(["Subject", "Date"], axis=0, ascending=True, inplace=True)

# Normalise column dtypes and drop rows duplicated by re-running the scraper.
data = data.infer_objects()
data = data.drop_duplicates()

# Pivot to one row per subject with one column per attendance status
# ('Present' / 'Absent'); subject/status combinations that never occur
# become 0 instead of NaN.
data = (
    data.groupby(["Subject", "Attend"])
    .size()
    .reset_index()
    .set_index(["Subject", "Attend"])
    .unstack(1)
    .fillna(0)
    .astype(int)
)
data.columns = data.columns.droplevel(0)

# Attendance percentage per subject.
data["Attendance"] = data["Present"] * 100 / (data["Present"] + data["Absent"])
print(data)
我的蜘蛛是spidey.py如下:
import scrapy
from scrapy.http import FormRequest
from ..items import AttendanceItem
from scrapy.utils.response import open_in_browser
import json
from scrapy.http.request.json_request import JsonRequest


class spidey(scrapy.Spider):
    """Log in to the student portal and scrape the attendance table."""

    name = 'spidyy'
    page_number = 2
    start_urls = [
        'http://app.bmiet.net/student/login'
    ]

    def parse(self, response):
        """Submit the login form, echoing back the CSRF token from the page."""
        token = response.css('form input::attr(value)').extract_first()
        return FormRequest.from_response(response, formdata={
            'csrf_token': token,
            'username': '*****USERNAME****',
            'password': '*****PASSWORD*****'
        }, callback=self.start_scraping)

    def start_scraping(self, response):
        """After a successful login, request the attendance page."""
        yield scrapy.Request('http://app.bmiet.net/student/attendance/view',
                             callback=self.parse_att)

    def parse_att(self, response):
        """Parse the attendance table and yield one item per table row.

        Fixes from the original:
        - A fresh AttendanceItem is created for every row. The original
          built ONE item outside the loop and mutated/yielded it repeatedly,
          so downstream consumers could observe the same object changing.
        - ``.strip()`` replaces exact-whitespace ``.replace()`` calls, which
          silently failed whenever the site's indentation changed slightly.
        - The loop variable no longer shadows the ``all`` builtin.
        """
        for table in response.css('#dataTable tbody '):
            dates = table.css('td:nth-child(1)::text').extract()
            attends = table.css('td:nth-child(2)::text').extract()
            subjects = table.css('td:nth-child(3)::text').extract()
            # assumes the three columns are parallel lists of equal length
            # for each row — TODO confirm against the live page markup
            for date, attend, subject in zip(dates, attends, subjects):
                items = AttendanceItem()
                items["Date"] = date.strip()
                items["Attend"] = attend.strip()
                items["Subject"] = subject.strip()
                yield items
请帮忙。
我希望它会奏效。
from typing import Union
import subprocess
import pandas as pd
import tkinter
from tkinter import *
from tkinter import messagebox
from datetime import *
from pandas import DataFrame, Series

# Re-run the spider before recomputing statistics so final.csv is fresh.
# subprocess.run with an argument list (shell=False) replaces the original
# os.system("cd attendance & scrapy crawl ...") shell string: cwd= does the
# directory change portably, and check=True raises if the crawl fails —
# the Windows-only `&` chaining continued unconditionally even on error.
subprocess.run(
    ["scrapy", "crawl", "spidyy", "-o", "attendance/final.csv"],
    cwd="attendance",
    check=True,
)

# Load the freshly scraped attendance records; column index 1 holds the
# date strings, so let pandas parse them into datetimes on read.
data = pd.read_csv(
    r"C:\Users\Administrator\PycharmProjects\Sarthak_project\attendance\attendance\final.csv",
    parse_dates=[1],
)

# Sort by subject first, then chronologically within each subject.
data.sort_values(["Subject", "Date"], axis=0, ascending=True, inplace=True)

# Normalise column dtypes and drop rows duplicated by re-running the scraper.
data = data.infer_objects()
data = data.drop_duplicates()

# Pivot to one row per subject with one column per attendance status
# ('Present' / 'Absent'); subject/status combinations that never occur
# become 0 instead of NaN.
data = (
    data.groupby(["Subject", "Attend"])
    .size()
    .reset_index()
    .set_index(["Subject", "Attend"])
    .unstack(1)
    .fillna(0)
    .astype(int)
)
data.columns = data.columns.droplevel(0)

# Attendance percentage per subject.
data["Attendance"] = data["Present"] * 100 / (data["Present"] + data["Absent"])
print(data)
让我知道结果。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.