使用 beautifulsoup 抓取 url 列表并将数据转换为 csv

Question

我是 Python 的新手。 以下问题：

1. 我有一个要从中抓取数据的 url 列表。 我不知道我的代码有什么问题，我无法从所有 url 中检索到结果。 该代码只抓取了第一个 url，而没有抓取其余的 url。 如何成功抓取列表中所有 url 的数据（标题、信息、描述、应用程序）？
2. 如果问题 1 解决了，如何将这些数据转换为 CSV 文件？

这是代码：

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import lxml
import pandas as pd
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

urlList = ["url1","url2","url3"...lists of urls.......]

# Download every URL in urlList and parse it.
# Each successful pass rebinds the module-level name `soup`; nothing inside
# this loop reads it, so an earlier page's soup is replaced by the next one.
for url in urlList:
    try:
        html = urlopen(url)
    except HTTPError as e:
        print(e)
    except URLError:
        print("error")
    else:
        # Runs only when urlopen() raised nothing.
        soup = BeautifulSoup(html.read(),"html5lib")
# Scraping
# Print the text of the first <h2 class="xx"> element.
# Takes no argument: it reads the module-level name `soup`, i.e. whatever
# was assigned to it most recently.
# NOTE(review): `.text` looks like a str here, so the loop would visit it one
# character at a time - confirm against the BeautifulSoup docs.
def getTitle():
    for title in soup.find('h2', class_="xx").text:
            print(title)

# Print the text of the first <ul class="j-k-i"> element.
# Takes no argument: it reads the module-level name `soup`, i.e. whatever
# was assigned to it most recently.
def getInfo():
   for info in soup.find('ul', class_="j-k-i").text:
        print(info)

# Print the text of the first <div class="b-d"> element.
# Takes no argument: it reads the module-level name `soup`, i.e. whatever
# was assigned to it most recently.
def getDescription():
    for description in soup.find('div', class_="b-d").text:
        print(description)

# Print the text of the first <div class="g-b bm-b-30"> element.
# Takes no argument: it reads the module-level name `soup`, i.e. whatever
# was assigned to it most recently.
def getApplication():
    for application in soup.find('div', class_="g-b bm-b-30").text:
       print(application)

# Call the four getters once for every item produced by soup().
# The loop variable `soups` is never used in the body, and the getters read
# the module-level `soup`, so every pass works on the same document.
# NOTE(review): calling a BeautifulSoup object presumably behaves like
# find_all() - confirm against the BeautifulSoup docs.
for soups in soup():
    getTitle()
    getInfo()
    getDescription()
    getApplication()

Answer 1

尝试以下方法：

import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import lxml
import pandas as pd
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError


def getTitle(soup):
    """Print the text of the first ``<h2 class="xx">`` element in *soup*.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Prints nothing when the page has no matching element.
    """
    title = soup.find('h2', class_="xx")
    # find() returns None when nothing matches; skip the field instead of
    # raising AttributeError so one incomplete page does not stop the run.
    if title is None:
        return
    # .text is a str: print it once. Looping over it would emit one
    # character per line.
    print(title.text)

def getInfo(soup):
    """Print the text of the first ``<ul class="j-k-i">`` element in *soup*.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Prints nothing when the page has no matching element.
    """
    info = soup.find('ul', class_="j-k-i")
    # find() returns None when nothing matches; skip the field instead of
    # raising AttributeError so one incomplete page does not stop the run.
    if info is None:
        return
    # .text is a str: print it once. Looping over it would emit one
    # character per line.
    print(info.text)

def getDescription(soup):
    """Print the text of the first ``<div class="b-d">`` element in *soup*.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Prints nothing when the page has no matching element.
    """
    description = soup.find('div', class_="b-d")
    # find() returns None when nothing matches; skip the field instead of
    # raising AttributeError so one incomplete page does not stop the run.
    if description is None:
        return
    # .text is a str: print it once. Looping over it would emit one
    # character per line.
    print(description.text)

def getApplication(soup):
    """Print the text of the first ``<div class="g-b bm-b-30">`` in *soup*.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Prints nothing when the page has no matching element.
    """
    application = soup.find('div', class_="g-b bm-b-30")
    # find() returns None when nothing matches; skip the field instead of
    # raising AttributeError so one incomplete page does not stop the run.
    if application is None:
        return
    # .text is a str: print it once. Looping over it would emit one
    # character per line.
    print(application.text)

urlList = ["url1","url2","url3"...lists of urls.......]

# Download each page in turn and print its scraped fields.
for url in urlList:
    try:
        # The context manager closes the HTTP response once the body has
        # been read, even if read() fails part-way through.
        with urlopen(url) as response:
            html = response.read()
    except HTTPError as e:
        # HTTPError subclasses URLError, so it has to be caught first.
        print(e)
    except URLError:
        print("error")
    else:
        # Parse the page that was just downloaded and pass that soup to every
        # getter explicitly, so each call works on the current page.
        soup = BeautifulSoup(html, "html5lib")

        getTitle(soup)
        getInfo(soup)
        getDescription(soup)
        getApplication(soup)

这样会把当前页面的 soup 作为参数传递给每个函数使用，而不是依赖循环结束后残留的全局变量 soup。

使用 beautifulsoup 抓取 url 列表并将数据转换为 csv

问题描述

1 个解决方案

解决方案1
0 2021-12-15 21:49:27

使用 beautifulsoup 抓取 url 列表并将数据转换为 csv

问题描述

1 个解决方案

解决方案1 0 2021-12-15 21:49:27

解决方案1
0 2021-12-15 21:49:27