[英]Scraping Dawn news website returns (referer:None)
我用來抓取新聞網站的爬蟲代碼只返回 (referer: None),代碼如下。我已經對BBC嘗試過相同的代碼,效果很好,但對於這個網站,它沒有返回所需的結果。
import os
import scrapy
# Output directory; the spider writes its scraped text files under this path.
newpath = 'urdu_data'
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(newpath, exist_ok=True)
class UrduSpider(scrapy.Spider):
    """Spider that saves paragraph text from dawnnews.tv pages to text files.

    Pages whose URL extends one of ``start_urls`` have the text of the ``<p>``
    elements inside ``div.story__content`` written to
    ``urdu_data/<last URL segment>.txt``. Links matching ``a.title-link`` are
    then queued for crawling with ``parse`` as the callback.
    """

    name = "urdu"
    # NOTE(review): the last three literals are not separated by commas, so
    # Python concatenates them into one string and this list holds only two
    # entries. Kept exactly as posted: this is the defect being asked about.
    start_urls = [
        'https://www.dawnnews.tv',
        'https://www.dawnnews.tv/latest-news'
        'https://www.dawnnews.tv/news'
        'https://www.dawnnews.tv/tech'
    ]

    def should_process_page(self, page_url):
        """Return True when ``page_url`` lies strictly below a start URL.

        A page qualifies if its URL starts with one of ``start_urls`` without
        being equal to that start URL.
        """
        return any(
            page_url.startswith(s_url) and page_url != s_url
            for s_url in self.start_urls
        )

    def parse(self, response):
        """Save the story text of ``response`` and yield requests for its links.

        Yields:
            scrapy.Request: one request per ``a.title-link`` href found on the
            page, resolved against the response URL.
        """
        if self.should_process_page(response.url):
            page_id = response.url.split("/")[-1]
            filename = page_id + '.txt'
            # If the response has a story body, save its contents.
            story_body = response.css('div.story__content')
            story_paragraphs_text = story_body.css('p::text')
            page_data = ''.join(
                p.extract() + '\n' for p in story_paragraphs_text
            )
            if page_data:
                # Explicit UTF-8 so Urdu text does not depend on the platform
                # default encoding; the context manager closes the handle.
                with open('urdu_data/' + filename, 'w',
                          encoding='utf-8') as out_file:
                    out_file.write(page_data)

        # Now follow any links that are present on the page.
        # NOTE(review): indentation was lost in the posted code; this section
        # is placed outside the guard above so start pages still get their
        # links followed -- confirm against the original.
        links = response.css('a.title-link ::attr(href)').extract()
        for link in links:
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse
            )
我認為您需要像下面這樣的start_urls
# Pages the crawl starts from; each entry is followed by a comma so that the
# literals stay separate list items.
start_urls = [
    "https://www.dawnnews.tv",
    "https://www.dawnnews.tv/latest-news",
    "https://www.dawnnews.tv/news",
    "https://www.dawnnews.tv/tech",
]
您在上面的代碼中沒有用逗號分隔各個網址,因此列表實際上只包含兩個網址:第一個網址,以及由其餘三個網址拼接而成的一個網址(Python會自動連接相鄰的字符串字面量)。請像上面那樣在每個網址後添加逗號。
接下來的story_body = response.css('div.story__content')
意味着給定網址的頁面中應該有一個class為story__content的div元素,而我認為上述網址的頁面中並沒有這個元素。https://www.dawnnews.tv 的頁面似乎有class為story__excerpt的div,但不確定這是否是您所需要的。無論如何,您需要檢查這些頁面的html並選取正確的內容。
2017-10-23 22:11:22 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.dawnnews.tv> (referer: None)
https://www.dawnnews.tv
2017-10-23 22:11:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.dawnnews.tv/news> (referer: None)
https://www.dawnnews.tv/news
news.txt
[]
2017-10-23 22:11:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.dawnnews.tv/tech> (referer: None)
https://www.dawnnews.tv/tech
tech.txt
[<Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">فیس '>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">یوٹی'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">واٹس'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">ویب '>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">ابھی'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">8 سا'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">اسما'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">دنیا'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">فیس '>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">سوشل'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt "> فیس'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div 
class="story__excerpt ">اگر '>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">اس ف'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">بہت '>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">اب پ'>]
2017-10-23 22:11:24 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.dawnnews.tv/latest-news> (referer: None)
https://www.dawnnews.tv/latest-news
latest-news.txt
[<Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">فلم '>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">فیس '>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">چیئر'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">غذا '>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">جوڈی'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">ہولی'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" data='<div class="story__excerpt ">پاکس'>, <Selector xpath="descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' story__excerpt ')]" ">
產生以上輸出的代碼如下:
import os
import scrapy
# Output directory; the spider writes its scraped text files under this path.
newpath = 'urdu_data'
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(newpath, exist_ok=True)
class UrduSpider(scrapy.Spider):
    """Spider that saves excerpt text from dawnnews.tv pages to text files.

    Pages whose URL extends one of ``start_urls`` have the text of the ``<p>``
    elements inside ``div.story__excerpt`` written to
    ``urdu_data/<last URL segment>.txt``. Links matching ``a.title-link`` are
    then queued for crawling with ``parse`` as the callback. The URL, the
    output filename and the matched selectors are printed for debugging.
    """

    name = "urdu"
    start_urls = [
        'https://www.dawnnews.tv',
        'https://www.dawnnews.tv/latest-news',
        'https://www.dawnnews.tv/news',
        'https://www.dawnnews.tv/tech',
    ]

    def should_process_page(self, page_url):
        """Return True when ``page_url`` lies strictly below a start URL.

        A page qualifies if its URL starts with one of ``start_urls`` without
        being equal to that start URL.
        """
        return any(
            page_url.startswith(s_url) and page_url != s_url
            for s_url in self.start_urls
        )

    def parse(self, response):
        """Save the excerpt text of ``response`` and yield requests for its links.

        Yields:
            scrapy.Request: one request per ``a.title-link`` href found on the
            page, resolved against the response URL.
        """
        print(response.url)
        if self.should_process_page(response.url):
            page_id = response.url.split("/")[-1]
            filename = page_id + '.txt'
            print(filename)
            # If the response has story excerpts, save their contents.
            story_body = response.css('div.story__excerpt')
            print(story_body)
            story_paragraphs_text = story_body.css('p::text')
            page_data = ''.join(
                p.extract() + '\n' for p in story_paragraphs_text
            )
            if page_data:
                # Explicit UTF-8 so Urdu text does not depend on the platform
                # default encoding; the context manager closes the handle.
                with open('urdu_data/' + filename, 'w',
                          encoding='utf-8') as out_file:
                    out_file.write(page_data)

        # Now follow any links that are present on the page.
        # NOTE(review): indentation was lost in the posted code; this section
        # is placed outside the guard above so start pages still get their
        # links followed -- confirm against the original.
        links = response.css('a.title-link ::attr(href)').extract()
        for link in links:
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse
            )
您需要進行類似的更改,以基於頁面的html結構從其他元素獲取響應。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.