I am working on a project for which I need to call my Scrapy spider from a different Python file, so that I can update the previously scraped data before working on it. I am not sure how I can do that. I am still learning Scrapy, so it would be great if you could explain exactly what I am supposed to do.
My file structure is as follows:
Here Calculator.py is the external file from which I want to call my spider.
The code of Calculator.py is as follows:
from typing import Union
import pandas as pd
import tkinter
from tkinter import *
from tkinter import messagebox
# import ttk
from datetime import *
from pandas import DataFrame, Series
# Load the attendance export produced by the spider; column 1 is parsed as dates.
data = pd.read_csv(r"C:\Users\Administrator\PycharmProjects\Sarthak_project\attendance\attendance\final.csv", parse_dates=[1])

# Order chronologically within each subject before aggregating.
data.sort_values(["Subject", "Date"], axis=0, ascending=True, inplace=True)

# Normalize dtypes and drop exact duplicate rows.
data = data.infer_objects()
data = data.drop_duplicates()

# Count Present/Absent occurrences per subject and pivot the Attend values
# into columns, filling subjects that miss a (Subject, Attend) pair with 0.
data = data.groupby(['Subject', 'Attend']).size().reset_index() \
    .set_index(['Subject', 'Attend']) \
    .unstack(1).fillna(0).astype(int)
data.columns = data.columns.droplevel(0)

# NOTE(review): this raises KeyError if 'Present' or 'Absent' never occurs
# anywhere in the data (unstack only creates columns for observed values).
data['Attendance'] = data['Present'] * 100 / (data['Present'] + data['Absent'])
print(data)
My spider, spidey.py, is as follows:
import scrapy
from scrapy.http import FormRequest
from ..items import AttendanceItem
from scrapy.utils.response import open_in_browser
import json
from scrapy.http.request.json_request import JsonRequest
class spidey(scrapy.Spider):
    """Log into the student portal and scrape the attendance table."""

    name = 'spidyy'
    # NOTE(review): page_number is never read in the visible code — confirm
    # before removing.
    page_number = 2
    start_urls = [
        'http://app.bmiet.net/student/login'
    ]

    def parse(self, response):
        """Submit the login form, carrying over the page's CSRF token."""
        # The token is rendered as the first <input> value in the form.
        token = response.css('form input::attr(value)').extract_first()
        # NOTE(review): credentials are hard-coded placeholders — move them to
        # settings or environment variables before committing real values.
        return FormRequest.from_response(response, formdata={
            'csrf_token' : token,
            'username' : '*****USERNAME****',
            'password' : '*****PASSWORD*****'
        }, callback = self.start_scraping)

    def start_scraping(self, response):
        """After login, request the attendance page."""
        yield scrapy.Request('http://app.bmiet.net/student/attendance/view',
                             callback=self.parse_att)

    def parse_att(self, response):
        """Yield one AttendanceItem per row of the attendance table."""
        # Renamed from `all`, which shadowed the builtin.
        tables = response.css('#dataTable tbody ')
        for table in tables:
            att = table.css('td:nth-child(2)::text').extract()
            sub = table.css('td:nth-child(3)::text').extract()
            date = table.css('td:nth-child(1)::text').extract()
            for b in range(0, len(date)):
                # Create a fresh item per record: reusing a single mutable
                # AttendanceItem across yields makes every yielded reference
                # point at the same, last-written data.
                items = AttendanceItem()
                items["Date"] = date[b].replace("\n ", "")
                items["Attend"] = att[b].replace(" \n \t\t", "")
                items["Subject"] = sub[b].replace(" \t\n \t\t ", "")
                yield items
Please help.
I hope it will work.
from typing import Union
import pandas as pd
import tkinter
from tkinter import *
from tkinter import messagebox
# import ttk
from datetime import *
from pandas import DataFrame, Series
# --- Refresh the scraped data before loading it ---------------------------
# The `os.system("cd attendance & scrapy crawl ...")` approach is fragile:
# on Unix a single `&` backgrounds the `cd` (so scrapy runs in the wrong
# directory and the CSV read below can race the still-running crawl), and a
# failed crawl goes unnoticed. subprocess.run waits for completion, runs in
# the right working directory, and raises if the crawl exits non-zero.
import subprocess
subprocess.run(["scrapy", "crawl", "spidyy", "-o", "attendance/final.csv"],
               cwd="attendance", check=True)
# --- end spider refresh ----------------------------------------------------
# Read the attendance export written by the spider (column 1 holds dates).
data = pd.read_csv(
    r"C:\Users\Administrator\PycharmProjects\Sarthak_project\attendance\attendance\final.csv",
    parse_dates=[1],
)

# Sort by subject, then chronologically within each subject.
data = data.sort_values(["Subject", "Date"], axis=0, ascending=True)

p = 0
a = 0
total = 0
attpercent = {}
temp = ""

# yyyy-dd-mm
data = data.infer_objects().drop_duplicates()

# date = datetime(2020, 7, 25)
# data['Date'] = pd.to_datetime(data["Date"].dt.strftime('%Y-%m-%d'))
# data = data.loc[data['Date'] > date.strftime('%Y-%d-%m')]

# Tally Present/Absent per subject and pivot the Attend values into columns.
counts = data.groupby(['Subject', 'Attend']).size()
data = counts.reset_index().set_index(['Subject', 'Attend']).unstack(1)
data = data.fillna(0).astype(int)
data.columns = data.columns.droplevel(0)

data['Attendance'] = 100 * data['Present'] / (data['Present'] + data['Absent'])
print(data)
Let me know the result.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.