
Writing a script for a college sports class, keep getting the error "AttributeError: module 'scrapy' has no attribute 'spider'"

This is my code; I'm not sure what I am doing wrong here. I'd appreciate any help.

from selenium import webdriver
from bs4 import BeautifulSoup
import scrapy
from scrapy.spiders import Spider
import requests
import time
import xlsxwriter
import pandas as pd

url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23'
driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver')
driver.get(url)

class WebSpider(scrapy.spider):
    name = "Web_Spider"
    allowed_domains = ['https://www.ufc.com/athletes']
    start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2']

    def __init__(self):
        self.driver = driver

    def parse(self, response):
        self.driver.get(response.url)

        while True:
            next = self.drive.find_element_by_xpath('//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')

            try:
                next.click()

            except:
                break

        self.driver.close()

I keep getting the error "AttributeError: module 'scrapy' has no attribute 'spider'". I'm not sure what to do here; Scrapy is installed correctly and up to date.

It's scrapy.Spider, with a capital "S".

Try now: 

from selenium import webdriver
from bs4 import BeautifulSoup
import scrapy
from scrapy.spiders import Spider
import requests
import time
import xlsxwriter
import pandas as pd

url = 'https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A23'
driver = webdriver.Chrome('/Applications/Python 3.9/chromedriver')
driver.get(url)

class WebSpider(scrapy.Spider):
    name = "Web_Spider"
    allowed_domains = ['https://www.ufc.com/athletes']
    start_urls = ['https://www.ufc.com/athletes/all?filters%5B0%5D=status%3A2']

    def __init__(self):
        self.driver = driver

    def parse(self, response):
        self.driver.get(response.url)

        while True:
            # locate the pager/load-more link; note that find_element_by_xpath was
            # removed in Selenium 4 in favour of driver.find_element(By.XPATH, ...)
            next = self.driver.find_element_by_xpath('//*[@id="block-mainpagecontent"]/div/div/div[2]/div/div/ul/li/a')

            try:
                next.click()

            except:
                break

        self.driver.close()
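
For what it's worth, defining the class only fixes the AttributeError; nothing above actually starts the crawl. A minimal sketch of how it could be run from a plain script with Scrapy's CrawlerProcess (the setting shown is just an example, adjust to your project):

from scrapy.crawler import CrawlerProcess

# run the spider without the `scrapy crawl` command
process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})  # example setting only
process.crawl(WebSpider)
process.start()  # blocks until the crawl is finished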

Depending on what you are trying to do, I wouldn't go with Selenium here, since you can fetch the data directly through the site's AJAX endpoint. Selenium will still work, but it's overkill and less efficient.

Try this:

import requests
from bs4 import BeautifulSoup
import re


url = 'https://www.ufc.com/views/ajax?_wrapper_format=drupal_ajax'
headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36'}

page = 1
end_of_load = False
while not end_of_load:
    # parameters for the Drupal views AJAX endpoint; 'page' is bumped each pass
    # until a request comes back with no athlete cards
    payload = {
        'view_name': 'all_athletes',
        'view_display_id': 'page',
        'view_path': '/athletes/all',
        'pager_element': '0',
        'gender': 'All',
        'page': '%s' % page}

    jsonData = requests.post(url, headers=headers, data=payload).json()
    print('Page: %s' % page)
    page += 1

    # the HTML fragment for this page sits in the 'data' field of the last
    # command in the AJAX response
    html = jsonData[-1]['data']

    soup = BeautifulSoup(html, 'html.parser')
    
    player_cards = soup.find_all('div',{'class':re.compile('.*view-mode-all-athletes-result.*')})
    if not player_cards:
        end_of_load = True
  
    else:
        for player_card in player_cards:
            name = player_card.find('span',{'class':re.compile('.*athlete__name.*')}).text.strip()
            try:
                weight_class = player_card.find('div',{'class':re.compile('.*weight-class.*')}).text.strip()
            except:
                weight_class = 'N/A'
            try:
                record = player_card.find('span',{'class':re.compile('.*athlete__record.*')}).text.strip()
            except:
                record = 'N/A'
            print('\t%s - %s\t%s' %(name,weight_class,record))
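
Since your original script already imports pandas and xlsxwriter, you could collect the rows instead of printing them and write everything to Excel once the loop finishes. A minimal sketch, assuming an athletes list accumulated inside the loop and an example output filename:

import pandas as pd

athletes = []  # assumed accumulator; append to it inside the for-loop above, e.g.
# athletes.append({'name': name, 'weight_class': weight_class, 'record': record})

# once the while-loop finishes, dump everything to a spreadsheet
df = pd.DataFrame(athletes)
df.to_excel('ufc_athletes.xlsx', index=False, engine='xlsxwriter')  # filename is just an example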
