简体   繁体   中英

Scrape HTML using Beautifulsoup

I've been trying to extract the data from the table below using the following code.

Link , Wanted Data

# Download the page and parse the HTML that the server returned.
page = requests.get('http://www.thaibma.or.th/EN/BondInfo/BondFeature/Issue.aspx?symbol=ba891dbb-f614-e711-b77e-78e3b51dab3c')
soup = BeautifulSoup(page.text, 'html.parser')

# For every <p> element, keep the result of find(text=True) on that element.
finddata = soup.findAll('p')
test = [paragraph.find(text=True) for paragraph in finddata]

print(test)

All of the information I want is inside "p" tags, but when I run this code and print the result, the text comes back blank.

Is there any way, or any other tool, I could use to extract this data?

The website uses JavaScript to render its data dynamically once the page loads.

The requests library cannot execute JavaScript, so the rendered data never appears in the response. You can use selenium or requests_html instead, and there are many other modules that can do this as well.

We also have another option: tracking down where the data comes from. I was able to locate the XHR request that retrieves the data from the back-end API before it is rendered on the user's side.

You can find the XHR request by opening the browser's Developer Tools, going to the Network tab, and checking the XHR/JS requests that are made (which one depends on the type of call, such as fetch).

import requests
from bs4 import BeautifulSoup
import json


# Fetch the bond-feature JSON that the Issue.aspx page loads through an XHR call.
#
# Steps: (1) load the HTML page inside a session, (2) read the `token` and
# `time` <input> values from it, (3) send them back as the `Token` and
# `timestamp` headers on the JSON request, (4) print the decoded JSON.

# Issue GUID used in both URLs below.
# NOTE(review): presumably other issue GUIDs can be substituted here -- confirm.
SYMBOL = "2dd6bca6-2543-ea11-a2f0-959434d0c31a"
PAGE_URL = "http://www.thaibma.or.th/EN/BondInfo/BondFeature/Issue.aspx?symbol=" + SYMBOL
# The original request passed the GUID in upper case to this endpoint; that is
# preserved because it is not known whether the endpoint is case-sensitive.
API_URL = "http://www.thaibma.or.th/issue/feature?Symbol=" + SYMBOL.upper()
TIMEOUT = 30  # seconds; without a timeout requests may wait indefinitely

with requests.Session() as req:
    page = req.get(PAGE_URL, timeout=TIMEOUT)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    token_input = soup.find("input", id="token")
    time_input = soup.find("input", id="time")
    if token_input is None or time_input is None:
        # Previously this surfaced as an AttributeError on None.
        raise RuntimeError(
            "Could not find the 'token'/'time' <input> elements in " + PAGE_URL)

    headers = {
        'Token': token_input.get("value"),
        'timestamp': time_input.get("value"),
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': PAGE_URL,
    }
    response = req.get(API_URL, headers=headers, timeout=TIMEOUT)
    response.raise_for_status()
    r = response.json()
    print(json.dumps(r, indent=4)) # to see the output in nice format.
    print("*" * 10)
    print(r.keys()) # you can access whatever as it's JSON dict now.

Output:

{
    "IssueID": "2dd6bca6-2543-ea11-a2f0-959434d0c31a",
    "IssueLegacyId": 76734,
    "Symbol": "BANPU20O22A",
    "SymbolTitle": "BANPU20O22A : Bill of Exchange of BANPU PUBLIC COMPANY LIMITED worth of THB 1,500.00 mln. due October 22, 2020 (BANPU20O22A)",
    "RegistrationDate": "2020-01-30T00:00:00",
    "IssueNameTh": "\u0e15\u0e31\u0e4b\u0e27\u0e41\u0e25\u0e01\u0e40\u0e07\u0e34\u0e19 \u0e1a\u0e23\u0e34\u0e29\u0e31\u0e17 \u0e1a\u0e49\u0e32\u0e19\u0e1b\u0e39 \u0e08\u0e33\u0e01\u0e31\u0e14 (\u0e21\u0e2b\u0e32\u0e0a\u0e19) \u0e21\u0e39\u0e25\u0e04\u0e48\u0e32 1,500.00 \u0e25\u0e49\u0e32\u0e19\u0e1a\u0e32\u0e17 \u0e04\u0e23\u0e1a\u0e01\u0e33\u0e2b\u0e19\u0e14\u0e44\u0e16\u0e48\u0e16\u0e2d\u0e19\u0e27\u0e31\u0e19\u0e17\u0e35\u0e48 22 \u0e15\u0e38\u0e25\u0e32\u0e04\u0e21 2563 (BANPU20O22A)",
    "IssueNameEn": "BANPU PUBLIC COMPANY LIMITED",
    "IsinTh": "0",
    "IsinEn": "0",
    "ClaimNameEn": "Senior",
    "SecureType": "Unsecured",
    "PrincipalPayment": "",
    "SustainabilityGoal": "",
    "CurrencyCode": "THB",
    "InitialPar": 1000.0,
    "CurrentPar": 1000.0,
    "IssueSize": 1500.0,
    "OutstandingSize": 1500.0,
    "IssuedDate": "2020-01-30T00:00:00",
    "MaturityDate": "2020-10-22T00:00:00",
    "IssueTerm": 0.7287671232876712,
    "CouponFrequencyNameEn": "At Maturity",
    "AccrualBasisNameEn": "Actual/365",
    "EmbbeddedOption": "-",
    "DistributionNameEn": "Institutional Investors",
    "CollateralRemark": "-",
    "IssueRemark": "Please be informed that the number shown in the \"Initial Par\" and \"Current Par\" do not represent the correct number.",
    "RiskLevelId": "6a8573d4-906a-ea11-a2f1-dca009a9f3d7",
    "RiskLevel": 3,
    "ProspectusId": null,
    "issuer_id": "ac90981d-e5f8-e111-93f5-78e3b51dab3c",
    "issuer_code": "BANPU"
}
**********
dict_keys(['IssueID', 'IssueLegacyId', 'Symbol', 'SymbolTitle', 'RegistrationDate', 'IssueNameTh', 'IssueNameEn', 'IsinTh', 'IsinEn', 'ClaimNameEn', 'SecureType', 'PrincipalPayment', 'SustainabilityGoal', 'CurrencyCode', 'InitialPar', 'CurrentPar', 'IssueSize', 'OutstandingSize', 'IssuedDate', 'MaturityDate', 'IssueTerm', 'CouponFrequencyNameEn', 'AccrualBasisNameEn', 'EmbbeddedOption', 'DistributionNameEn', 'CollateralRemark', 'IssueRemark', 'RiskLevelId', 'RiskLevel', 'ProspectusId', 'issuer_id', 'issuer_code'])

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM