
Python web scraping from a list of URLs

I'm new to asks and Trio in Python, and I have some sample code. Let me explain: I have a list of URLs, each of which is a news URL, and each of those pages contains further sub-URLs. The first URL is requested, all the other hrefs on it are collected and added to a list, and then the articles behind every href in that list are fetched. The problem is that sometimes the articles come back blank.
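For context, a minimal sketch of that collect-the-hrefs step might look like the following. The listing URL is taken from the JSON sample further down, and the use of the standard-library HTMLParser (rather than whatever the real crawler uses) is an illustrative assumption:

import asks
import trio
from html.parser import HTMLParser
from urllib.parse import urljoin

asks.init('trio')


class HrefCollector(HTMLParser):
    """Collect every href attribute found in <a> tags."""
    def __init__(self):
        super().__init__()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.hrefs.append(value)


async def collect_article_urls(listing_url, session):
    # Fetch the listing page and return absolute URLs for every link on it.
    r = await session.get(listing_url, timeout=10)
    body = r.content
    if isinstance(body, bytes):
        body = body.decode("utf-8", errors="ignore")
    collector = HrefCollector()
    collector.feed(body)
    return [urljoin(listing_url, href) for href in collector.hrefs]


async def demo():
    session = asks.Session(connections=10)
    # listing page taken from the question's JSON sample
    urls = await collect_article_urls(
        "http://www.mining.com/tag/latin-america/page/1/", session)
    print(len(urls), "links found")

if __name__ == "__main__":
    trio.run(demo)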

Here is the sample code, tried with a single URL to make sure it works as expected:

import asks
import trio
from goose3 import Goose
import logging as log
from goose3.configuration import ArticleContextPattern
from pprint import pprint
import json
import time

asks.init('trio') 


async def extractor(path, htmls, paths, session):

    try:
        r = await session.get(path, timeout=2)
        out = r.content
        htmls.append(out)
        paths.append(path)
    except Exception as e:
        out = str(e)
        htmls.append(out)
        paths.append(path)


async def main(path_list, session):    
    htmls = []
    paths = []
    async with trio.open_nursery() as n:
        for path in path_list:
            n.start_soon(extractor, path, htmls, paths, session)

    return htmls, paths


async def run(urls, conns=50): 


    s = asks.Session(connections=conns)
    g = Goose()

    htmls, paths = await main(urls, s)
    print(htmls,"       ",paths)
    cleaned = []
    for html, path in zip(htmls, paths):
        dic = {}
        dic['url'] = path
        if html is not None:                            
            try:
                #g.config.known_context_pattern = ArticleContextPattern(attr='class', value='the-post')
                article = g.extract(raw_html=html)
                author=article.authors
                dic['goose_text'] = article.cleaned_text
                #print(article.cleaned_text)
                #dic['goose_date'] = article.publish_datetime
                dic['goose_title'] = article.title
                if author:
                    dic['authors']=author[0]
                else:
                    dic['authors'] =''
            except Exception as e:
                raise
                print(e)
                log.info('goose found no text using html')
                dic['goose_html'] = html
                dic['goose_text'] = ''
                dic['goose_date'] = None
                dic['goose_title'] = None
                dic['authors'] =''
            cleaned.append(dic)
    return cleaned




async def real_main():
    sss= '[{"crawl_delay_sec": 0, "name": "mining","goose_text":"","article_date":"","title":"", "story_url": "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project","url": "http://www.mining.com/tag/latin-america/page/1/"},{"crawl_delay_sec": 0, "name": "mining", "story_url": "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries", "url": "http://www.mining.com/tag/latin-america/page/1/"}]'

    obj = json.loads(sss)
    pprint(obj)

    articles=[]
    for l in obj:
      articles.append(await run([l['story_url']]))
      #await trio.sleep(3)

    pprint(articles)

if __name__ == "__main__":
    trio.run(real_main)

How can I fetch the article data without any of it going missing?

I'm missing some further information to answer your question in depth, but it is most likely related to how goose searches for text within the HTML. See this answer for more details: https://stackoverflow.com/a/30408761/8867146
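If the pages that come back empty keep their article body in a markup pattern goose does not recognize, telling goose which container to look in can help. Here is a minimal sketch along the lines of the commented-out ArticleContextPattern line in the question; the "the-post" class name and the exact known_context_patterns attribute name are assumptions that may vary between goose3 versions:

from goose3 import Goose
from goose3.configuration import ArticleContextPattern

g = Goose()
# Assumption: the article body lives inside an element with class "the-post"
# (the class name comes from the commented-out line in the question, and the
# attribute is known_context_patterns in current goose3 releases).
g.config.known_context_patterns = ArticleContextPattern(attr='class', value='the-post')

raw_html = """
<html><body>
  <h1>Sample headline</h1>
  <div class="the-post"><p>Body text that goose should now pick up.</p></div>
</body></html>
"""
article = g.extract(raw_html=raw_html)
print(article.title, '|', article.cleaned_text)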

"asks" does not always raise an exception when the status code is != 200. You need to examine the response's status code before using its content. You may also want to increase the timeout; 2 seconds is not enough, particularly when you are firing off up to 50 connections in parallel.

In any case, here is a simplified program: all the Goose stuff is entirely unnecessary for showing the actual error, using two result arrays is not a good idea, and adding error messages to the result array looks messy.

Also, you should look into running the URL fetching and processing in parallel. trio.open_memory_channel is your friend here; a rough sketch of that approach follows the program below.


import asks
asks.init('trio')

import trio
from pprint import pprint

async def extractor(path, session, results):
    try:
        r = await session.get(path, timeout=2)
        if r.status_code != 200:
            raise asks.errors.BadStatus("Not OK",r.status_code)
        out = r.content
    except Exception as e:
        # do some reasonable error handling
        print(path, repr(e))
    else:
        results.append((out, path))

async def main(path_list, session):
    results = []
    async with trio.open_nursery() as n:
        for path in path_list:
            n.start_soon(extractor, path, session, results)
    return results


async def run(conns=50):
    s = asks.Session(connections=conns)

    urls = [
            "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries",
            "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project",
            "https://www.google.com",  # just for testing more parallel connections
            "https://www.debian.org",
            ]

    results = await main(urls, s)
    for content, path in results:
        pass  # analyze this result
    print("OK")

if __name__ == "__main__":
    trio.run(run)
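
For completeness, here is a rough sketch of the trio.open_memory_channel approach mentioned above, so that each page can be processed as soon as its fetch finishes instead of waiting for all of them. The buffer size, the longer timeout, and the placeholder processing step are assumptions; the Goose extraction from the question would go where the consumer handles each page:

import asks
asks.init('trio')

import trio


async def fetch(path, session, send_channel):
    # Producer: fetch one URL and push (content, path) into the channel.
    async with send_channel:  # close this clone once the fetch is done
        try:
            r = await session.get(path, timeout=10)
            if r.status_code != 200:
                raise asks.errors.BadStatus("Not OK", r.status_code)
        except Exception as e:
            print(path, repr(e))
        else:
            await send_channel.send((r.content, path))


async def process(receive_channel, results):
    # Consumer: handle each page as soon as it arrives.
    # The Goose extraction from the question would go here.
    async for content, path in receive_channel:
        results.append((len(content), path))  # placeholder "processing"


async def run(urls, conns=50):
    session = asks.Session(connections=conns)
    results = []
    send_channel, receive_channel = trio.open_memory_channel(conns)
    async with trio.open_nursery() as nursery:
        nursery.start_soon(process, receive_channel, results)
        async with send_channel:
            for url in urls:
                nursery.start_soon(fetch, url, session, send_channel.clone())
        # Once the original channel and all clones are closed, the
        # `async for` in process() ends and the nursery can exit.
    return results


if __name__ == "__main__":
    urls = [
        "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries",
        "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project",
    ]
    print(trio.run(run, urls))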
