简体   繁体   English

使用 Python 中的特定文本进行 Web 抓取 HTML 表

[英]Web scraping HTML table with certain text in Python

I am trying to web scrape an HTML table using Python.我正在尝试使用 python 抓取 HTML 表格。 There are many tables in the HTML page, but I want to scrape only a certain table. HTML 页面中有很多表格,但我只想抓取某个表格。 I am using Beautiful Soup to do this web scraping.我正在使用美丽的汤来进行网络抓取。

My code looks like this:我的代码如下所示:

# NOTE(review): `get` is assumed to come from `from requests import get` — the import is not shown here; confirm.
page = get("http://uobgoldprice.com/history/2018/September/10/")
html = BeautifulSoup(page.content, 'html.parser')

# Scan every table row on the page for the target label.
# NOTE(review): `p.text` is the concatenated text of ALL cells in the row, so an
# exact `==` comparison against one cell's label will likely never match — verify.
for p in html.select('tr'):
    if p.text == "ARGOR CAST BAR":
        print (p.text)

I would like only the table that reads "Rate as at Monday, 10 September 2018".我只想要写着“Rate as at Monday, 10 September, 2018”的表格。

How do I go about doing that?我该怎么做?

You need to find the element that contains the text, then locate its parent element that is a table:您需要找到包含文本的元素和作为表格的父元素:

import re
import requests
from bs4 import BeautifulSoup

# Download the page and parse it.
response = requests.get("http://uobgoldprice.com/history/2018/September/10/")
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the text node holding the date heading, then climb up to the
# enclosing <table> element and print it.
date_pattern = re.compile('Rate as at Monday, 10 September 2018')
marker = soup.find(text=date_pattern)
print(marker.find_parent('table'))
from collections import defaultdict

import requests
from bs4 import BeautifulSoup


def get_page_html(url):
    """Fetch *url* and return the response body as text.

    Raises requests.HTTPError if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def parse_last_table(html):
    """Parse the final <table> in *html* into {description: [row dicts]}.

    The first two <tr> rows are treated as headers and skipped.  Data rows
    with a blank description cell inherit the most recent non-blank
    description, so multi-row product groups collect under a single key.
    Rows that do not contain exactly five <td> cells are ignored.
    """
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find_all('table')[-1]

    grouped = defaultdict(list)
    current_group = None
    for row in table.find_all('tr')[2:]:
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if len(cells) != 5:
            continue  # blank/empty row

        description, currency, unit, bank_sells, bank_buys = cells
        # Empty description means "same group as the previous row".
        current_group = description or current_group
        grouped[current_group].append({
            'currency': currency,
            'unit': unit,
            'bank_sells': bank_sells,
            'bank_buys': bank_buys
        })
    return grouped

Output:输出:

>>> url = 'http://uobgoldprice.com/history/2018/September/10/'
>>> page_html = get_page_html(url)
>>> result = parse_last_table(page_html)
>>> import json; print(json.dumps(result, indent=2))
{
  "ARGOR CAST BAR": [
    {
      "currency": "SGD",
      "unit": "100 GM",
      "bank_sells": "5,369.00 (+4.00)",
      "bank_buys": "5,291.00 (+3.00)"
    }
  ],
  "CAST BARS": [
    {
      "currency": "SGD",
      "unit": "1 KILOBAR",
      "bank_sells": "53,201.00 (+36.00)",
      "bank_buys": "52,933.00 (+36.00)"
    }
  ],
  "GOLD CERTIFICATE": [
    {
      "currency": "SGD",
      "unit": "1 KILOCERT",
      "bank_sells": "53,201.00 (+36.00)",
      "bank_buys": "52,933.00 (+36.00)"
    }
  ],
  "GOLD SAVINGS A/C": [
    {
      "currency": "SGD",
      "unit": "1 GM",
      "bank_sells": "53.20 (+0.04)",
      "bank_buys": "52.94 (+0.04)"
    }
  ],
  "GOLD BULLION COINS": [
    {
      "currency": "SGD",
      "unit": "1/20 OZ(GNC,SLC &GML)",
      "bank_sells": "131.00",
      "bank_buys": "81.00"
    },
    {
      "currency": "SGD",
      "unit": "1/10 OZ",
      "bank_sells": "211.00 (+1.00)",
      "bank_buys": "163.00"
    },
    {
      "currency": "SGD",
      "unit": "1/4 OZ",
      "bank_sells": "465.00",
      "bank_buys": "410.00"
    },
    {
      "currency": "SGD",
      "unit": "1/2 OZ",
      "bank_sells": "904.00 (+1.00)",
      "bank_buys": "822.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "1,726.00 (+1.00)",
      "bank_buys": "1,645.00 (+1.00)"
    }
  ],
  "PAMP GOLD BARS": [
    {
      "currency": "SGD",
      "unit": "1/2 OZ",
      "bank_sells": "876.00",
      "bank_buys": "821.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "1 GM",
      "bank_sells": "82.00",
      "bank_buys": "50.00"
    },
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "1,711.00 (+1.00)",
      "bank_buys": "1,644.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "2.5 GM",
      "bank_sells": "182.00",
      "bank_buys": "130.00"
    },
    {
      "currency": "SGD",
      "unit": "5 GM",
      "bank_sells": "322.00",
      "bank_buys": "262.00"
    },
    {
      "currency": "SGD",
      "unit": "10 GM",
      "bank_sells": "597.00 (+1.00)",
      "bank_buys": "527.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "20 GM",
      "bank_sells": "1,132.00 (+1.00)",
      "bank_buys": "1,056.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "50 GM",
      "bank_sells": "2,746.00 (+2.00)",
      "bank_buys": "2,644.00 (+2.00)"
    },
    {
      "currency": "SGD",
      "unit": "100 GM",
      "bank_sells": "5,414.00 (+3.00)",
      "bank_buys": "5,291.00 (+3.00)"
    }
  ],
  "SILVER PASSBOOK ACCOUNT": [
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "19.86 (+0.09)",
      "bank_buys": "19.30 (+0.09)"
    }
  ]
}

I believe this code will help you.我相信这段代码会对你有所帮助。 If you want the complete running project, visit "html to pdf Web scraping".如果你想要完整的运行项目访问html to pdf 网页抓取

import logging
import math
import json
from flask import jsonify, abort, make_response
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import pydf

from constants import Constants
from response import Response


class SeleniumCrawler(object):
    """Render a paginated 'kbn-table' with headless Chrome and export it to PDF."""

    def get_page(self, url):
        """Open *url* in headless Chrome, page through the table widget,
        concatenate every page's rows into one HTML body, and write out.pdf.

        Returns the success response payload as a plain dict; on any failure,
        logs the exception and aborts the Flask request with a server-error
        JSON response.
        """
        response = Response()
        browser = None
        try:

            # Initilized the chrome driver
            print("Initilized the chrome driver")
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--window-size=1420,1080')
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            browser = webdriver.Chrome(chrome_options=chrome_options)

            # browser url
            browser.get(url)
            # NOTE(review): WebDriverWait's timeout is in *seconds*; 10000 is
            # ~2.8 hours. This looks like it was meant to be milliseconds —
            # confirm whether 10 was intended.
            delay = 10000

            # wait till specific classes appears
            print("wait till specific classes appears")
            WebDriverWait(browser, delay).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'kbn-table')))
            body = browser.find_element_by_class_name("kbn-table").get_attribute('innerHTML')

            # calculate number of pages exists and loop them
            print("calculate number of pages exists and loop them")
            total_rows = (str(browser.find_element_by_class_name("kuiToolBarText").text)
                          .split(" ")[2]).replace(",", "")
            # `pages` = number of ADDITIONAL pages beyond the one already loaded.
            pages = math.ceil(int(total_rows) / 50) - 1

            print("pages found {}".format(pages))
            # Click "next" once per remaining page. The original
            # `range(1, pages)` clicked only pages-1 times and dropped the
            # last page; range(pages) visits every remaining page.
            for page in range(pages):
                browser.execute_script("document.getElementsByClassName('kuiButton')[1].click()")
                chunk = browser.find_element_by_class_name("kbn-table").get_attribute('innerHTML').replace("<tbody>", "")
                body += chunk

            # apply table tags and generate pdf
            print("apply table tags and generate pdf")
            pdf = pydf.generate_pdf("<table>" + body + "</table>")
            with open('out.pdf', 'wb') as f:
                f.write(pdf)

            return json.loads(json.dumps((response.get_response(Constants.SUCCESS, Constants.SUCCESS))))
        except Exception as e:
            logging.exception(e)

            return abort(make_response(jsonify(response.get_response(Constants.SERVER_ERROR, Constants.SERVER_ERROR)), response.get_code(Constants.SERVER_ERROR)))
        finally:
            # Always release the Chrome process — the original leaked the
            # browser on every call, success or failure.
            if browser is not None:
                browser.quit()

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM