简体   繁体   中英

How to call a Scrapy Spider in a different Python Script

I am working on a project for which I need to call my Scrapy spider from a different Python file, so as to update the previously scraped data before working on it. I am not sure how I can do that. I am still learning Scrapy, so it would be great if you could explain exactly what I am supposed to do.

My file structure is as follows:

[Image: enter image description here]

Here calculator.py is the external file where I want to call my spider.

The code of Calculator.py is as follows:

from typing import Union

import pandas as pd
import tkinter
from tkinter import *
from tkinter import messagebox
# import ttk



from datetime import *

from pandas import DataFrame, Series

# Load the scraped attendance records; the column at index 1 is parsed as dates.
data = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Sarthak_project\attendance\attendance\final.csv", parse_dates= [1])

# Order the records by subject, then by date within each subject.
data = data.sort_values(["Subject", "Date"], axis=0, ascending=True)
#yyyy-dd-mm
data = data.infer_objects()
# Drop rows that are exact duplicates of an earlier row.
data = data.drop_duplicates()
#date = datetime(2020, 7, 25)

#data['Date'] = pd.to_datetime(data["Date"].dt.strftime('%Y-%m-%d'))
#data = data.loc[data['Date'] > date.strftime('%Y-%d-%m')]

# Count the rows for every (Subject, Attend) pair, then pivot the Attend values
# into columns; a pair that never occurs gets a count of 0.
data = data.groupby(['Subject', 'Attend']).size().unstack('Attend', fill_value=0)

# If a status never appears in the file (e.g. no absences at all) its column
# would be missing and the percentage below would raise KeyError.
for status in ('Present', 'Absent'):
    if status not in data.columns:
        data[status] = 0

# Percentage of classes attended per subject.
data['Attendance'] = data['Present'] * 100 / (data['Present'] + data['Absent'])

print(data)

My spider, spidey.py, is as follows:

import scrapy
from scrapy.http import FormRequest
from ..items import AttendanceItem
from scrapy.utils.response import open_in_browser
import json
from scrapy.http.request.json_request import JsonRequest

class spidey(scrapy.Spider):
    """Log in to the student portal at app.bmiet.net and scrape attendance.

    Request flow: ``parse`` submits the login form found on the start URL,
    ``start_scraping`` requests the attendance page once the login response
    arrives, and ``parse_att`` yields one ``AttendanceItem`` per table row.
    """

    name = 'spidyy'
    page_number = 2
    start_urls = [
        'http://app.bmiet.net/student/login'
    ]

    def parse(self, response):
        """Submit the login form contained in the start page.

        Returns a ``FormRequest`` whose response is handled by
        ``start_scraping``.
        """
        # Value of the first <input> inside a form; presumably the hidden
        # CSRF field -- TODO confirm against the login page markup.
        token = response.css('form input::attr(value)').extract_first()
        return FormRequest.from_response(response, formdata={
            'csrf_token' : token,
             'username' : '*****USERNAME****',
             'password' : '*****PASSWORD*****'
        }, callback = self.start_scraping)

    def start_scraping(self, response):
        """Request the attendance page after the login form was submitted."""
        yield scrapy.Request('http://app.bmiet.net/student/attendance/view', callback=self.parse_att)

    def parse_att(self, response):
        """Yield one ``AttendanceItem`` per row of the ``#dataTable`` table.

        Column 1 holds the date, column 2 the attendance status and
        column 3 the subject.
        """
        for table_body in response.css('#dataTable tbody '):
            dates = table_body.css('td:nth-child(1)::text').extract()
            statuses = table_body.css('td:nth-child(2)::text').extract()
            subjects = table_body.css('td:nth-child(3)::text').extract()

            if not (len(dates) == len(statuses) == len(subjects)):
                # The three lists are paired by position, so unequal lengths
                # mean some cells could not be matched to a row.
                self.logger.warning(
                    "Attendance columns differ in length: "
                    "%d dates, %d statuses, %d subjects",
                    len(dates), len(statuses), len(subjects),
                )

            for date, status, subject in zip(dates, statuses, subjects):
                # Build a fresh item for every row; re-yielding one shared
                # instance would let a later row overwrite an earlier one
                # that is still being processed.
                item = AttendanceItem()
                # strip() removes the surrounding newlines/tabs/spaces without
                # depending on the exact indentation of the page's HTML.
                item["Date"] = date.strip()
                item["Attend"] = status.strip()
                item["Subject"] = subject.strip()
                yield item

Please help.

I hope it will work.

from typing import Union

import pandas as pd
import tkinter
from tkinter import *
from tkinter import messagebox
# import ttk



from datetime import *

from pandas import DataFrame, Series

# Add the lines from here down to "until here" to your Calculator.py, above the
# pd.read_csv call, so the spider runs before the CSV is read.
import os
import subprocess
import sys

# Run "scrapy crawl spidyy" with the Scrapy project directory as the working
# directory. Passing cwd= replaces the shell "cd attendance & ..." chain, which
# only works in the Windows command prompt. check=True raises
# CalledProcessError if the crawl fails, instead of silently carrying on and
# reading a stale CSV.
# NOTE: per the Scrapy docs "-o" appends to an existing output file.
subprocess.run(
    [sys.executable, "-m", "scrapy", "crawl", "spidyy", "-o", "attendance/final.csv"],
    cwd="attendance",
    check=True,
)

#until here

# Load the scraped attendance records; the column at index 1 is parsed as dates.
data = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Sarthak_project\attendance\attendance\final.csv", parse_dates= [1])

# Order the records by subject, then by date within each subject.
data = data.sort_values(["Subject", "Date"], axis=0, ascending=True)
#yyyy-dd-mm
data = data.infer_objects()
# Drop rows that are exact duplicates of an earlier row.
data = data.drop_duplicates()
#date = datetime(2020, 7, 25)

#data['Date'] = pd.to_datetime(data["Date"].dt.strftime('%Y-%m-%d'))
#data = data.loc[data['Date'] > date.strftime('%Y-%d-%m')]

# Count the rows for every (Subject, Attend) pair, then pivot the Attend values
# into columns; a pair that never occurs gets a count of 0.
data = data.groupby(['Subject', 'Attend']).size().unstack('Attend', fill_value=0)

# If a status never appears in the file (e.g. no absences at all) its column
# would be missing and the percentage below would raise KeyError.
for status in ('Present', 'Absent'):
    if status not in data.columns:
        data[status] = 0

# Percentage of classes attended per subject.
data['Attendance'] = data['Present'] * 100 / (data['Present'] + data['Absent'])

print(data)

Let me know the result.

The technical post webpages of this site follow the CC BY-SA 4.0 license. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM