Python urllib.error.HTTPError: HTTP Error 404: Not Found

I am trying to run a script that scrapes news text from a site.

Since the front page was reorganized, there don't seem to be any restrictions on scraping.

But I keep getting the error below, even though it works when I test a single URL.

Any suggestions?

Traceback (most recent call last):
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\bigkinds_headers_v1.3_20170522.py", line 157, in <module>
    result = fetch_news_detail(news['href'])  
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\bigkinds_headers_v1.3_20170522.py", line 107, in fetch_news_detail
    res = urlopen(url).read().decode('utf-8', errors='ignore')
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 532, in open
    response = meth(req, response)
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 570, in error
    return self._call_chain(*args)
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 504, in _call_chain
    result = func(*args)
  File "C:\Users\park\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found

Here is my code:

import urllib.request
import urllib.parse
import json
import time
import random
from urllib.request import urlopen
from urllib.error import URLError  # needed for the except clause in fetch_news_list
from bs4 import BeautifulSoup

def fetch_news_list(page, keyword, start, end):

    result = []

    url = "https://www.bigkinds.or.kr/news/newsResult.do"

    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13F69 Safari/601.1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive'
        }

    param = {
        'pageInfo':'newsResult',
        'login_chk':'null',
        'LOGIN_SN':'null',
        'LOGIN_NAME':'null',
        'indexName':'news',
        'keyword':keyword,
        'byLine':'',
        'searchScope':'1',
        'searchFtr':'1',
        'startDate':start,
        'endDate':end,
        'sortMethod':'date',
        'contentLength':'100',
        'providerCode':'',
        'categoryCode':'',
        'incidentCode':'',
        'dateCode':'',
        'highlighting':'',
        'sessionUSID':'',
        'sessionUUID':'test',
        'listMode':'',
        'categoryTab':'',
        'newsId':'',
        'filterProviderCode':'',
        'filterCategoryCode':'',
        'filterIncidentCode':'',
        'filterDateCode':'',
        'startNo':page,
        'resultNumber':'100',
        'topmenuoff':'',
        'resultState':'',
        # Build the JSON with the real values; a plain string literal would send
        # the variable names "keyword", "start" and "end" verbatim.
        'keywordJson': json.dumps({
            "searchDetailTxt1": keyword, "agreeDetailTxt1": "", "needDetailTxt1": "",
            "exceptDetailTxt1": "", "o_id": "option1", "startDate": start,
            "endDate": end, "providerNm": "", "categoryNm": "", "incidentCategoryNm": "",
            "providerCode": "", "categoryCode": "", "incidentCategoryCode": "",
            "searchFtr": "1", "searchScope": "1", "searchKeyword": keyword
        }),
        'keywordFilterJson':'',
        'totalCount':'',
        'interval':'',
        'quotationKeyword1':'',
        'quotationKeyword2':'',
        'quotationKeyword3':'',
        'searchFromUseYN':'N',
        'mainTodayPersonYn':'',
        'period':'1year'
        }

    param = urllib.parse.urlencode(param).encode() 

    req = urllib.request.Request(url, param, headers)
    sleepTime = random.randint(4,10)
    time.sleep(sleepTime)
    print(str(sleepTime) + ' seconds wait.')

    try:

        res = urllib.request.urlopen(req)

    except URLError as e:
        # HTTPError is a subclass of URLError and has both 'code' and 'reason',
        # so check 'code' first; a plain URLError only has 'reason'.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        else:
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        return result  # 'res' is undefined after a failure, so bail out with the empty list

    html = res.read()

    soup = BeautifulSoup(html, "html.parser")

    div_tags = soup.find_all('div', class_='resTxt')

    for cts in div_tags:

        ids = cts.find('h3')['id'][5:31]
        title = cts.find('h3',class_='list_newsId').get_text(strip=True)
        href = 'https://www.bigkinds.or.kr/news/detailView.do?docId=' + ids + '&returnCnt=1'

        sets = {
            'title' : title,
            'href' : href
            }

        result.append(sets)

    return result

def fetch_news_detail(url):
    result = {}
    res = urlopen(url).read().decode('utf-8', errors='ignore')

    responseJson = json.loads(res)       

    category = responseJson.get("detail").get("CATEGORY_MAIN")
    date = responseJson.get("detail").get("DATE")
    provider = responseJson.get("detail").get("PROVIDER")
    content = responseJson.get("detail").get("CONTENT")

    result = {
        'category': category,
        'date': date,
        'provider' : provider,
        'content': content
    }

    return result


keyword = input('(eg., 외국인 NOT(증시 OR 순매수 OR 증권 OR 코스피 OR 코스닥 OR 주식 OR 주가 OR 투타 OR KBO OR 야구 OR KBL OR 농구 OR 축구 OR 올림픽 OR K리그))\n input word: ')
start = input('(eg., 2017-01-01)\n input startday: ')
end = input('(eg., 2017-02-01)\n input endday: ')

page = 1
count = 1
flag = True  

f = open('bigkinds.txt', 'w', encoding='utf-8')

while True:
    if not flag:
        break

    news_list = fetch_news_list(page, keyword, start, end)
    sleepTime = random.randint(3,8)
    time.sleep(sleepTime)
    print(str(sleepTime) + ' seconds wait.')

    for news in news_list:
        result = fetch_news_detail(news['href']) 

        result['title'] = news['title']

        f.write('==' * 40 + '\n')
        f.write('category: ' + result['category'] + '\n')
        f.write('title: ' + result['title'] + '\n')
        f.write('date: ' + result['date'] + '\n')
        f.write('provider: ' + result['provider'] + '\n')
        f.write('content: ' + result['content'] + '\n')
        f.write('==' * 40 + '\n')

        count += 1
        if count >= 5002:
            flag = False
            break

        # pause between detail requests
        sleepTime = random.randint(2, 10)
        time.sleep(sleepTime)
        print(str(sleepTime) + ' seconds wait.')

    page += 1

f.close()

I ran into this kind of problem when scraping data from Yelp. These restrictions are hard to get around. I suggest trying the following.

  1. Change your User-Agent. The current one appears to be for an iPhone; use one for a regular PC browser (see the sketch after this list).
  2. If that works but you get blocked after making a certain number of page requests, take a look at StarCluster. You can find a draft of the code at https://yangwangteaching.wordpress.com/data-science-meetup/ under "4/13/2017 (UTEP CoBA 310 – CALC LAB #2)".
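For the first suggestion, below is a minimal sketch of what fetch_news_detail could look like with a desktop User-Agent and explicit 404 handling. The User-Agent string is only an example of a desktop browser string, and the skip-on-failure behavior is an assumption about what you want rather than part of your original code:

import json
from urllib.request import Request, urlopen
from urllib.error import HTTPError

# Example desktop User-Agent string (an assumption; any current PC browser string should do).
DESKTOP_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36'
}

def fetch_news_detail(url):
    # Send the headers on the detail request too, not only on the list request.
    req = Request(url, headers=DESKTOP_HEADERS)
    try:
        res = urlopen(req).read().decode('utf-8', errors='ignore')
    except HTTPError as e:
        # Log the failing URL so one bad docId does not abort the whole run.
        print('HTTP Error', e.code, 'for', url)
        return None

    detail = json.loads(res).get('detail', {})
    return {
        'category': detail.get('CATEGORY_MAIN'),
        'date': detail.get('DATE'),
        'provider': detail.get('PROVIDER'),
        'content': detail.get('CONTENT'),
    }

With this version, the main loop should skip failed articles, for example by checking whether the returned result is None and continuing before writing to the file.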
