简体   繁体   中英

Web scraping HTML table with certain text in Python

I am trying to scrape an HTML table using Python. There are many tables in the HTML page, but I want to scrape only one particular table. I am using Beautiful Soup to do the scraping.

My code looks like this:

# Download the page and parse the response body with the 'html.parser' parser.
# NOTE(review): `get` is presumably `requests.get` (the import is not shown) — confirm.
page = get("http://uobgoldprice.com/history/2018/September/10/")
html = BeautifulSoup(page.content, 'html.parser')

# Visit every <tr> in the document (rows of all tables, not just one) and
# print a row only when its entire text equals "ARGOR CAST BAR".
for p in html.select('tr'):
    if p.text == "ARGOR CAST BAR":
        print (p.text)

I would like only the table that reads "Rate as at Monday, 10 September 2018".

How do I go about doing that?

You need to find the element that contains the text, and then its parent that is a table:

import re
import requests
from bs4 import BeautifulSoup

page = requests.get("http://uobgoldprice.com/history/2018/September/10/")
# Stop early on an HTTP error instead of parsing an error page.
page.raise_for_status()
html = BeautifulSoup(page.content, 'html.parser')

# Locate the text node holding the table caption. `string=` is the current
# name of this argument; the older `text=` spelling is deprecated.
element = html.find(string=re.compile('Rate as at Monday, 10 September 2018'))
if element is None:
    # Exit with a clear message rather than an AttributeError on None.
    raise SystemExit('Caption text not found on the page')

# Walk up from the text node to the enclosing <table> and print it.
print(element.find_parent('table'))
from collections import defaultdict

import requests
from bs4 import BeautifulSoup


def get_page_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout the call can
            block indefinitely on an unresponsive server.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text


def parse_last_table(html):
    """Group the rows of the last <table> in *html* by their description.

    The first two rows of the table are skipped, and any row that does not
    have exactly five <td> cells is ignored. A row whose description cell
    is empty is filed under the description of the previous kept row.

    Returns a ``defaultdict(list)`` mapping each description to a list of
    dicts with the keys ``currency``, ``unit``, ``bank_sells`` and
    ``bank_buys`` (all stripped cell text).
    """
    fields = ('currency', 'unit', 'bank_sells', 'bank_buys')
    grouped = defaultdict(list)
    current_description = None

    tables = BeautifulSoup(html, 'lxml').find_all('table')
    data_rows = tables[-1].find_all('tr')[2:]
    for tr in data_rows:
        cells = [td.text.strip() for td in tr.find_all('td')]
        if len(cells) != 5:
            continue  # blank/empty row

        # An empty description cell continues the previous group.
        current_description = cells[0] or current_description
        grouped[current_description].append(dict(zip(fields, cells[1:])))
    return grouped

Output:

>>> url = 'http://uobgoldprice.com/history/2018/September/10/'
>>> page_html = get_page_html(url)
>>> result = parse_last_table(page_html)
>>> import json; print(json.dumps(result, indent=2))
{
  "ARGOR CAST BAR": [
    {
      "currency": "SGD",
      "unit": "100 GM",
      "bank_sells": "5,369.00 (+4.00)",
      "bank_buys": "5,291.00 (+3.00)"
    }
  ],
  "CAST BARS": [
    {
      "currency": "SGD",
      "unit": "1 KILOBAR",
      "bank_sells": "53,201.00 (+36.00)",
      "bank_buys": "52,933.00 (+36.00)"
    }
  ],
  "GOLD CERTIFICATE": [
    {
      "currency": "SGD",
      "unit": "1 KILOCERT",
      "bank_sells": "53,201.00 (+36.00)",
      "bank_buys": "52,933.00 (+36.00)"
    }
  ],
  "GOLD SAVINGS A/C": [
    {
      "currency": "SGD",
      "unit": "1 GM",
      "bank_sells": "53.20 (+0.04)",
      "bank_buys": "52.94 (+0.04)"
    }
  ],
  "GOLD BULLION COINS": [
    {
      "currency": "SGD",
      "unit": "1/20 OZ(GNC,SLC &GML)",
      "bank_sells": "131.00",
      "bank_buys": "81.00"
    },
    {
      "currency": "SGD",
      "unit": "1/10 OZ",
      "bank_sells": "211.00 (+1.00)",
      "bank_buys": "163.00"
    },
    {
      "currency": "SGD",
      "unit": "1/4 OZ",
      "bank_sells": "465.00",
      "bank_buys": "410.00"
    },
    {
      "currency": "SGD",
      "unit": "1/2 OZ",
      "bank_sells": "904.00 (+1.00)",
      "bank_buys": "822.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "1,726.00 (+1.00)",
      "bank_buys": "1,645.00 (+1.00)"
    }
  ],
  "PAMP GOLD BARS": [
    {
      "currency": "SGD",
      "unit": "1/2 OZ",
      "bank_sells": "876.00",
      "bank_buys": "821.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "1 GM",
      "bank_sells": "82.00",
      "bank_buys": "50.00"
    },
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "1,711.00 (+1.00)",
      "bank_buys": "1,644.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "2.5 GM",
      "bank_sells": "182.00",
      "bank_buys": "130.00"
    },
    {
      "currency": "SGD",
      "unit": "5 GM",
      "bank_sells": "322.00",
      "bank_buys": "262.00"
    },
    {
      "currency": "SGD",
      "unit": "10 GM",
      "bank_sells": "597.00 (+1.00)",
      "bank_buys": "527.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "20 GM",
      "bank_sells": "1,132.00 (+1.00)",
      "bank_buys": "1,056.00 (+1.00)"
    },
    {
      "currency": "SGD",
      "unit": "50 GM",
      "bank_sells": "2,746.00 (+2.00)",
      "bank_buys": "2,644.00 (+2.00)"
    },
    {
      "currency": "SGD",
      "unit": "100 GM",
      "bank_sells": "5,414.00 (+3.00)",
      "bank_buys": "5,291.00 (+3.00)"
    }
  ],
  "SILVER PASSBOOK ACCOUNT": [
    {
      "currency": "SGD",
      "unit": "1 OZ",
      "bank_sells": "19.86 (+0.09)",
      "bank_buys": "19.30 (+0.09)"
    }
  ]
}

I believe this code will help you. If you want the complete running project, see "html to pdf Web scraping".

import logging
import math
import json
from flask import jsonify, abort, make_response
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import pydf

from constants import Constants
from response import Response


class SeleniumCrawler(object):
    """Scrape a paginated ``kbn-table`` with headless Chrome and write it to a PDF."""

    def get_page(self, url):
        """Load *url*, collect the table HTML from every page and save ``out.pdf``.

        The method opens *url* in headless Chrome, waits for an element with
        class ``kbn-table``, then repeatedly clicks the second ``kuiButton``
        element and appends the table's inner HTML after each click. The
        collected markup is wrapped in ``<table>`` tags, rendered with
        ``pydf.generate_pdf`` and written to ``out.pdf`` in the current
        working directory.

        Args:
            url: Address of the page to open.

        Returns:
            The success payload built by ``Response.get_response``, after a
            JSON round trip.

        Raises:
            Whatever ``flask.abort`` raises for the server-error response
            when any step fails; the original exception is logged first.
        """
        response = Response()
        browser = None  # lets the ``finally`` clause know whether Chrome started
        try:

            # Initialize the chrome driver
            print("Initilized the chrome driver")
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--window-size=1420,1080')
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            # ``options=`` replaces the removed ``chrome_options=`` keyword.
            browser = webdriver.Chrome(options=chrome_options)

            # browser url
            browser.get(url)
            # NOTE(review): WebDriverWait takes seconds, so this waits up to
            # 10000 s — possibly meant as milliseconds; confirm.
            delay = 10000

            # wait till specific classes appears
            print("wait till specific classes appears")
            WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'kbn-table')))
            body = browser.find_element(By.CLASS_NAME, "kbn-table").get_attribute('innerHTML')

            # calculate number of pages exists and loop them
            print("calculate number of pages exists and loop them")
            # The third space-separated token of the toolbar text is used as
            # the total count (presumably rows, 50 per page — verify).
            pages = (str(browser.find_element(By.CLASS_NAME, "kuiToolBarText").text).split(" ")[2]).replace(",", "")
            pages = math.ceil(int(pages) / 50) - 1

            print("pages found {}".format(pages))
            for _ in range(1, pages):
                # Click the second ``kuiButton`` (presumably "next page"),
                # then grab the table again.
                browser.execute_script("document.getElementsByClassName('kuiButton')[1].click()")
                chunk = browser.find_element(By.CLASS_NAME, "kbn-table").get_attribute('innerHTML').replace("<tbody>", "")
                body += chunk

            # apply table tags and generate pdf
            print("apply table tags and generate pdf")
            pdf = pydf.generate_pdf("<table>" + body + "</table>")
            with open('out.pdf', 'wb') as f:
                f.write(pdf)

            return json.loads(json.dumps((response.get_response(Constants.SUCCESS, Constants.SUCCESS))))
        except Exception as e:
            logging.exception(e)

            return abort(make_response(jsonify(response.get_response(Constants.SERVER_ERROR, Constants.SERVER_ERROR)), response.get_code(Constants.SERVER_ERROR)))
        finally:
            # Always shut Chrome down so no headless process is left behind.
            if browser is not None:
                browser.quit()

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM